0 Data Preparation

	hadoop fs -mkdir /root/labs
	hadoop fs -mkdir /root/labs/datasets
	hadoop fs -mkdir /root/labs/datasets/labs
	hadoop fs -mkdir /root/labs/datasets/nlp
	hadoop fs -mkdir /root/labs/datasets/labs/mllib
	hadoop fs -put /root/TrainingOnHDP/dataset/spark/mllib/sample_libsvm_data.txt /root/labs/datasets/labs/mllib/sample_libsvm_data.txt
	hadoop fs -mkdir /root/labs/datasets/labs/mllib/ridge-data
	hadoop fs -put /root/TrainingOnHDP/dataset/spark/mllib/ridge-data/lpsa.data /root/labs/datasets/labs/mllib/ridge-data/lpsa.data
	hadoop fs -put /root/TrainingOnHDP/dataset/spark/mllib/sample_naive_bayes_data.txt /root/labs/datasets/labs/mllib/sample_naive_bayes_data.txt
	hadoop fs -mkdir /root/labs/datasets/labs/mllib/als
	hadoop fs -put /root/TrainingOnHDP/dataset/spark/mllib/als/test.data /root/labs/datasets/labs/mllib/als/test.data
	hadoop fs -put /root/TrainingOnHDP/dataset/spark/mllib/kmeans_data.txt /root/labs/datasets/labs/mllib/kmeans_data.txt
	hadoop fs -put /root/TrainingOnHDP/dataset/spark/mllib/sample_fpgrowth.txt /root/labs/datasets/labs/mllib/sample_fpgrowth.txt
	hadoop fs -put /root/TrainingOnHDP/dataset/spark/mllib/sample_binary_classification_data.txt /root/labs/datasets/labs/mllib/sample_binary_classification_data.txt
	hadoop fs -put /root/TrainingOnHDP/dataset/spark/nlp/country-lyrics.json /root/labs/datasets/nlp/country-lyrics.json
	
	wget http://stat-computing.org/dataexpo/2009/2007.csv.bz2 -O /tmp/flights_2007.csv.bz2
	wget http://stat-computing.org/dataexpo/2009/2008.csv.bz2 -O /tmp/flights_2008.csv.bz2
	wget ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/2007.csv.gz -O /tmp/weather_2007.csv.gz
	wget ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/2008.csv.gz -O /tmp/weather_2008.csv.gz	
	
	hadoop fs -mkdir /tmp/airflightsdelays
	hadoop fs -put /tmp/flights_2007.csv.bz2 /tmp/airflightsdelays/flights_2007.csv.bz2
	hadoop fs -put /tmp/flights_2008.csv.bz2 /tmp/airflightsdelays/flights_2008.csv.bz2
	hadoop fs -put /tmp/weather_2007.csv.gz /tmp/airflightsdelays/weather_2007.csv.gz
	hadoop fs -put /tmp/weather_2008.csv.gz /tmp/airflightsdelays/weather_2008.csv.gz
	
	wget https://github.com/felixcheung/spark-notebook-examples/blob/master/data/kddcupsmall?raw=true -O /tmp/kddcupsmall.txt
	hadoop fs -put /tmp/kddcupsmall.txt /root/labs/datasets/kddcupsmall.txt
	
	
1. Data Types – Local Vector

	1.1 Example of Dense and Sparse Vector:
	
		import org.apache.spark.mllib.linalg.{Vector, Vectors}
		
		val dv: Vector = Vectors.dense(1.1, 0.0, 0.0, 4.4, 0.0, 5.5)
		val sv1: Vector = Vectors.sparse(6, Array(0, 3, 5), Array(1.1, 4.4, 5.5))
		val sv2: Vector = Vectors.sparse(6, Seq((0, 1.1), (3, 4.4), (5, 5.5)))

2. Data Types – Labeled point

	2.1 Example:
	
		import org.apache.spark.mllib.linalg.Vectors
		import org.apache.spark.mllib.regression.LabeledPoint
		val pos = LabeledPoint(1.0, Vectors.dense(1.1, 0.0, 0.0, 4.4, 0.0, 5.5))
		val neg = LabeledPoint(0.0, Vectors.sparse(6, Array(0, 3, 5), Array(1.1, 4.4, 5.5)))
		
	2.2 Example:

		import org.apache.spark.mllib.regression.LabeledPoint
		import org.apache.spark.mllib.util.MLUtils
		import org.apache.spark.rdd.RDD

		val examples: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "/root/labs/datasets/labs/mllib/sample_libsvm_data.txt")
		
3. Data Types – Local Matrix

	3.1 Example:

		import org.apache.spark.mllib.linalg.{Matrix, Matrices}
		val dm: Matrix = Matrices.dense(3, 3, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0, 3.0, 5.0, 7.0))
		val sm: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(9.0, 6.0, 8.0))	
		
4. Data Types – Row Matrix

	4.1 Example:

		import org.apache.spark.mllib.linalg.{Vector, Vectors}
		import org.apache.spark.mllib.linalg.distributed.RowMatrix
		import org.apache.spark.rdd.RDD
		val denseData = Seq(Vectors.dense(0.0, 1.0, 2.0),Vectors.dense(3.0, 4.0, 5.0),Vectors.dense(6.0, 7.0, 8.0),Vectors.dense(9.0, 0.0, 1.0))
		val rows: RDD[Vector] = sc.parallelize(denseData, 2)
		val mat: RowMatrix = new RowMatrix(rows)
		val m = mat.numRows()
		val n = mat.numCols()
		// QR decomposition 
		val qrResult = mat.tallSkinnyQR(true)
		val G = mat.computeGramianMatrix()
		

5. Data Types – Indexed Row Matrix

	5.1 Examples:
	
		import org.apache.spark.mllib.linalg.Vectors
		import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, RowMatrix}
		import org.apache.spark.rdd.RDD
		val data = Seq((0L, Vectors.dense(0.0, 1.0, 2.0)),(1L, Vectors.dense(3.0, 4.0, 5.0)),(3L, Vectors.dense(9.0, 0.0, 1.0))).map(x => IndexedRow(x._1, x._2))
		val indexedRows: RDD[IndexedRow] = sc.parallelize(data, 2)
		val mat: IndexedRowMatrix = new IndexedRowMatrix(indexedRows)
		val m = mat.numRows()
		val n = mat.numCols()
		// Drop its row indices.
		val rowMat: RowMatrix = mat.toRowMatrix()
		val coordMat = mat.toCoordinateMatrix()
		val blockMat = mat.toBlockMatrix(2, 2)
		
6. Data Types – Coordinate Matrix

	6.1 Example
	
		import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}
		val entries = sc.parallelize(Seq((0, 0, 1.0),(0, 1, 2.0),(1, 1, 3.0),(1, 2, 4.0),(2, 2, 5.0),(2, 3, 6.0),(3, 0, 7.0),(3, 3, 8.0),(4, 1, 9.0)), 3).map { case (i, j, value) => MatrixEntry(i, j, value)}
		val mat: CoordinateMatrix = new CoordinateMatrix(entries)
		val m = mat.numRows()
		val n = mat.numCols()
		// Convert it to an IndexRowMatrix whose rows are sparse vectors.
		val indexedRowMatrix = mat.toIndexedRowMatrix()
		val blockMat = mat.toBlockMatrix(2, 2)
		
7. Word2Vec

	7.1 Example

		import org.apache.spark._
		import org.apache.spark.rdd._
		import org.apache.spark.SparkContext._
		import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}	
		
		val sentence = "a b " * 100 + "a c " * 10
		val localDoc = Seq(sentence, sentence)
		val doc = sc.parallelize(localDoc).map(line => line.split(" ").toSeq)
		val model = new Word2Vec().setVectorSize(10).setSeed(42L).fit(doc)
		val synonyms = model.findSynonyms("a", 2)
		for((synonym, cosineSimilarity) <- synonyms) {println(s"$synonym $cosineSimilarity")}
		// Save and load model
		model.save(sc, "myWord2VecModelPath")
		val sameModel = Word2VecModel.load(sc, "myWord2VecModelPath")
		
		val word2VecMap = Map(("china", Array(0.50f, 0.50f, 0.50f, 0.50f)), ("japan", Array(0.40f, 0.50f, 0.50f, 0.50f)), ("taiwan", Array(0.60f, 0.50f, 0.50f, 0.50f)), ("korea", Array(0.45f, 0.60f, 0.60f, 0.60f)))
		val model = new Word2VecModel(word2VecMap)
		val syms = model.findSynonyms("china", 2)	

8. StandardScaler

	8.1 Example

		import org.apache.spark.SparkContext._
		import org.apache.spark.mllib.feature.StandardScaler
		import org.apache.spark.mllib.linalg.Vectors
		import org.apache.spark.mllib.util.MLUtils
		import org.apache.spark.mllib.feature.StandardScalerModel

		val data = MLUtils.loadLibSVMFile(sc, "/root/labs/datasets/labs/mllib/sample_libsvm_data.txt")
		val scaler1 = new StandardScaler().fit(data.map(x => x.features))
		val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features))
		val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean)
		val data1 = data.map(x => (x.label, scaler1.transform(x.features)))
		val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray))))
		
9. Normalizer

	9.1 Example

		import org.apache.spark.SparkContext._
		import org.apache.spark.mllib.feature.Normalizer
		import org.apache.spark.mllib.linalg.Vectors
		import org.apache.spark.mllib.util.MLUtils
		val data = MLUtils.loadLibSVMFile(sc, "/root/labs/datasets/labs/mllib/sample_libsvm_data.txt")
		val normalizer1 = new Normalizer()
		val normalizer2 = new Normalizer(p = Double.PositiveInfinity)
		val data1 = data.map(x => (x.label, normalizer1.transform(x.features)))
		val data2 = data.map(x => (x.label, normalizer2.transform(x.features)))	

10. SVM

	10.1 Example

		import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
		import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
		import org.apache.spark.mllib.util.MLUtils
		import org.apache.spark.mllib.optimization.L1Updater

		val data = MLUtils.loadLibSVMFile(sc, "/root/labs/datasets/labs/mllib/sample_libsvm_data.txt")
		val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
		val training = splits(0).cache()
		val test = splits(1)
		val numIterations = 100
		val model = SVMWithSGD.train(training, numIterations)
		model.clearThreshold()
		val scoreAndLabels = test.map { point => val score = model.predict(point.features); (score, point.label)}
		val metrics = new BinaryClassificationMetrics(scoreAndLabels)
		val auROC = metrics.areaUnderROC()
		println("Area under ROC = " + auROC)
		model.save(sc, "mySVMModelPath")
		val sameModel = SVMModel.load(sc, "mySVMModelPath")
		
		val svmAlg = new SVMWithSGD()
		svmAlg.optimizer.setNumIterations(200).setRegParam(0.1).setUpdater(new L1Updater)
		val modelL1 = svmAlg.run(training)
		
11. Logistic regression

	11.1 Example

		import org.apache.spark.SparkContext
		import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, LogisticRegressionModel}
		import org.apache.spark.mllib.evaluation.MulticlassMetrics
		import org.apache.spark.mllib.regression.LabeledPoint
		import org.apache.spark.mllib.linalg.Vectors
		import org.apache.spark.mllib.util.MLUtils

		val data = MLUtils.loadLibSVMFile(sc, "/root/labs/datasets/labs/mllib/sample_libsvm_data.txt")
		val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
		val training = splits(0).cache()
		val test = splits(1)
		val model = new LogisticRegressionWithLBFGS().setNumClasses(10).run(training)
		val predictionAndLabels = test.map { case LabeledPoint(label, features) => val prediction = model.predict(features); (prediction, label)}

		val metrics = new MulticlassMetrics(predictionAndLabels)
		val precision = metrics.precision
		println("Precision = " + precision)

		model.save(sc, "myLRWithBFGSModelPath")
		val sameModel = LogisticRegressionModel.load(sc, "myLRWithBFGSModelPath")	
		
12. Linear least squares regression

	12.1 Example

		import org.apache.spark.mllib.regression.LabeledPoint
		import org.apache.spark.mllib.regression.LinearRegressionModel
		import org.apache.spark.mllib.regression.LinearRegressionWithSGD
		import org.apache.spark.mllib.linalg.Vectors

		val data = sc.textFile("/root/labs/datasets/labs/mllib/ridge-data/lpsa.data")
		val parsedData = data.map { line => val parts = line.split(','); LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))}.cache()
		val numIterations = 100
		val model = LinearRegressionWithSGD.train(parsedData, numIterations)
		val valuesAndPreds = parsedData.map { point => val prediction = model.predict(point.features); (point.label, prediction)}
		val MSE = valuesAndPreds.map{case(v, p) => math.pow((v - p), 2)}.mean()
		println("training Mean Squared Error = " + MSE)
		// Save and load model
		model.save(sc, "myLRWithSGDModelPath")
		val sameModel = LinearRegressionModel.load(sc, "myLRWithSGDModelPath")	
		

13. Linear lasso regression

	13.1 Example

		import org.apache.spark.mllib.regression.LabeledPoint
		import org.apache.spark.mllib.regression.LassoModel
		import org.apache.spark.mllib.regression.LassoWithSGD
		import org.apache.spark.mllib.linalg.Vectors

		val data = sc.textFile("/root/labs/datasets/labs/mllib/ridge-data/lpsa.data")
		val parsedData = data.map { line => val parts = line.split(','); LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))}.cache()
		val numIterations = 100
		val model = LassoWithSGD.train(parsedData, numIterations)
		val valuesAndPreds = parsedData.map { point => val prediction = model.predict(point.features); (point.label, prediction)}
		val MSE = valuesAndPreds.map{case(v, p) => math.pow((v - p), 2)}.mean()
		println("training Mean Squared Error = " + MSE)
		// Save and load model
		model.save(sc, "myLassoWithSGDModelPath")
		val sameModel = LassoModel.load(sc, "myLassoWithSGDModelPath")	

14. Linear ridge regression

	14.1 Example

		import org.apache.spark.mllib.regression.LabeledPoint
		import org.apache.spark.mllib.regression.RidgeRegressionModel
		import org.apache.spark.mllib.regression.RidgeRegressionWithSGD
		import org.apache.spark.mllib.linalg.Vectors

		val data = sc.textFile("/root/labs/datasets/labs/mllib/ridge-data/lpsa.data")
		val parsedData = data.map { line => val parts = line.split(','); LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))}.cache()
		val numIterations = 100
		val model = RidgeRegressionWithSGD.train(parsedData, numIterations)
		val valuesAndPreds = parsedData.map { point => val prediction = model.predict(point.features); (point.label, prediction)}
		val MSE = valuesAndPreds.map{case(v, p) => math.pow((v - p), 2)}.mean()
		println("training Mean Squared Error = " + MSE)
		// Save and load model
		model.save(sc, "myRRWithSGDModelPath")
		val sameModel = RidgeRegressionModel.load(sc, "myRRWithSGDModelPath")	

15. Decision Tree - Classification

	15.1 Example		
	
		import org.apache.spark.mllib.tree.DecisionTree
		import org.apache.spark.mllib.tree.model.DecisionTreeModel
		import org.apache.spark.mllib.util.MLUtils

		val data = MLUtils.loadLibSVMFile(sc, "/root/labs/datasets/labs/mllib/sample_libsvm_data.txt")
		val splits = data.randomSplit(Array(0.7, 0.3))
		val (trainingData, testData) = (splits(0), splits(1))

		val numClasses = 2
		val categoricalFeaturesInfo = Map[Int, Int]()
		val impurity = "gini"
		val maxDepth = 5
		val maxBins = 32

		val model = DecisionTree.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins)

		val labelAndPreds = testData.map { point => val prediction = model.predict(point.features); (point.label, prediction)}
		val testErr = labelAndPreds.filter(r => r._1 != r._2).count().toDouble / testData.count()
		println("Test Error = " + testErr)
		println("Learned classification tree model:\n" + model.toDebugString)

		// Save and load model
		model.save(sc, "myDecisionTreeClassificationModel")
		val sameModel = DecisionTreeModel.load(sc, "myDecisionTreeClassificationModel")
		
	
16. Decision Tree - Regression

	16.1 Example			
	
		import org.apache.spark.mllib.tree.DecisionTree
		import org.apache.spark.mllib.tree.model.DecisionTreeModel
		import org.apache.spark.mllib.util.MLUtils

		val data = MLUtils.loadLibSVMFile(sc, "/root/labs/datasets/labs/mllib/sample_libsvm_data.txt")
		val splits = data.randomSplit(Array(0.7, 0.3))
		val (trainingData, testData) = (splits(0), splits(1))

		val categoricalFeaturesInfo = Map[Int, Int]()
		val impurity = "variance"
		val maxDepth = 5
		val maxBins = 32

		val model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo, impurity, maxDepth, maxBins)

		val labelsAndPredictions = testData.map { point => val prediction = model.predict(point.features); (point.label, prediction)}
		val testMSE = labelsAndPredictions.map{ case (v, p) => math.pow(v - p, 2) }.mean()
		println("Test Mean Squared Error = " + testMSE)
		println("Learned regression tree model:\n" + model.toDebugString)

		// Save and load model
		model.save(sc, "target/tmp/myDecisionTreeRegressionModel")
		val sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeRegressionModel")
		
17. Random Forest - Classification

	17.1 Example		

		import org.apache.spark.mllib.tree.RandomForest
		import org.apache.spark.mllib.tree.model.RandomForestModel
		import org.apache.spark.mllib.util.MLUtils

		val data = MLUtils.loadLibSVMFile(sc, "/root/labs/datasets/labs/mllib/sample_libsvm_data.txt")
		val splits = data.randomSplit(Array(0.7, 0.3))
		val (trainingData, testData) = (splits(0), splits(1))

		val numClasses = 2
		val categoricalFeaturesInfo = Map[Int, Int]()
		val numTrees = 3
		val featureSubsetStrategy = "auto"
		val impurity = "gini"
		val maxDepth = 4
		val maxBins = 32

		val model = RandomForest.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins)

		val labelAndPreds = testData.map { point => val prediction = model.predict(point.features); (point.label, prediction)}
		val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
		println("Test Error = " + testErr)
		println("Learned classification forest model:\n" + model.toDebugString)

		model.save(sc, "myRandomForestClassificationModel")
		val sameModel = RandomForestModel.load(sc, "myRandomForestClassificationModel")
	
17. Random Forest - Regression

	17.1 Example			
	
		import org.apache.spark.mllib.tree.RandomForest
		import org.apache.spark.mllib.tree.model.RandomForestModel
		import org.apache.spark.mllib.util.MLUtils

		val data = MLUtils.loadLibSVMFile(sc, "/root/labs/datasets/labs/mllib/sample_libsvm_data.txt")
		val splits = data.randomSplit(Array(0.7, 0.3))
		val (trainingData, testData) = (splits(0), splits(1))

		val numClasses = 2
		val categoricalFeaturesInfo = Map[Int, Int]()
		val numTrees = 3 
		val featureSubsetStrategy = "auto"
		val impurity = "variance"
		val maxDepth = 4
		val maxBins = 32

		val model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins)

		val labelsAndPredictions = testData.map { point => val prediction = model.predict(point.features);(point.label, prediction)}
		val testMSE = labelsAndPredictions.map{ case(v, p) => math.pow((v - p), 2)}.mean()
		println("Test Mean Squared Error = " + testMSE)
		println("Learned regression forest model:\n" + model.toDebugString)

		model.save(sc, "myRandomForestRegressionModel")
		val sameModel = RandomForestModel.load(sc, "myRandomForestRegressionModel")	
		
18. Gradient-boosted trees - Classification
	
	18.1 Example

		import org.apache.spark.mllib.tree.GradientBoostedTrees
		import org.apache.spark.mllib.tree.configuration.BoostingStrategy
		import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
		import org.apache.spark.mllib.util.MLUtils

		val data = MLUtils.loadLibSVMFile(sc, "/root/labs/datasets/labs/mllib/sample_libsvm_data.txt")
		val splits = data.randomSplit(Array(0.7, 0.3))
		val (trainingData, testData) = (splits(0), splits(1))

		val boostingStrategy = BoostingStrategy.defaultParams("Classification")
		boostingStrategy.numIterations = 3 
		boostingStrategy.treeStrategy.numClasses = 2
		boostingStrategy.treeStrategy.maxDepth = 5
		boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]()

		val model = GradientBoostedTrees.train(trainingData, boostingStrategy)

		val labelAndPreds = testData.map { point =>	val prediction = model.predict(point.features); (point.label, prediction)}
		val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
		println("Test Error = " + testErr)
		println("Learned classification GBT model:\n" + model.toDebugString)

		model.save(sc, "myGradientBoostingClassificationModel")
		val sameModel = GradientBoostedTreesModel.load(sc, "myGradientBoostingClassificationModel")
	

19. Gradient-boosted trees - Regression
	
	19.1 Example
	
		import org.apache.spark.mllib.tree.GradientBoostedTrees
		import org.apache.spark.mllib.tree.configuration.BoostingStrategy
		import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
		import org.apache.spark.mllib.util.MLUtils

		val data = MLUtils.loadLibSVMFile(sc, "/root/labs/datasets/labs/mllib/sample_libsvm_data.txt")
		val splits = data.randomSplit(Array(0.7, 0.3))
		val (trainingData, testData) = (splits(0), splits(1))

		val boostingStrategy = BoostingStrategy.defaultParams("Regression")
		boostingStrategy.numIterations = 3 
		boostingStrategy.treeStrategy.maxDepth = 5
		boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]()

		val model = GradientBoostedTrees.train(trainingData, boostingStrategy)
		val labelsAndPredictions = testData.map { point => val prediction = model.predict(point.features); (point.label, prediction)}
		val testMSE = labelsAndPredictions.map{ case(v, p) => math.pow((v - p), 2)}.mean()
		println("Test Mean Squared Error = " + testMSE)
		println("Learned regression GBT model:\n" + model.toDebugString)

		model.save(sc, "myGradientBoostingRegressionModel")
		val sameModel = GradientBoostedTreesModel.load(sc, "myGradientBoostingRegressionModel")
		
20. Naive Bayes

	20.1 Example

		import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
		import org.apache.spark.mllib.linalg.Vectors
		import org.apache.spark.mllib.regression.LabeledPoint

		val data = sc.textFile("/root/labs/datasets/labs/mllib/sample_naive_bayes_data.txt")
		val parsedData = data.map { line => val parts = line.split(','); LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))}

		val splits = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L)
		val training = splits(0)
		val test = splits(1)

		val model = NaiveBayes.train(training, lambda = 1.0, modelType = "multinomial")

		val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
		val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()

		model.save(sc, "myNaiveBayesModel")
		val sameModel = NaiveBayesModel.load(sc, "myNaiveBayesModel")	
		
21. Collaborative Filtering

	21.1 Example
	
		import org.apache.spark.mllib.recommendation.ALS
		import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
		import org.apache.spark.mllib.recommendation.Rating

		val data = sc.textFile("/root/labs/datasets/labs/mllib/als/test.data")
		val ratings = data.map(_.split(',') match { case Array(user, item, rate) => Rating(user.toInt, item.toInt, rate.toDouble)})

		val rank = 10
		val numIterations = 10
		val model = ALS.train(ratings, rank, numIterations, 0.01)

		val usersProducts = ratings.map { case Rating(user, product, rate) => (user, product)}
		val predictions =  model.predict(usersProducts).map { case Rating(user, product, rate) => ((user, product), rate)}
		val ratesAndPreds = ratings.map { case Rating(user, product, rate) => ((user, product), rate)}.join(predictions)
		
		model.save(sc, "myCollaborativeFilter")
		val sameModel = MatrixFactorizationModel.load(sc, "myCollaborativeFilter")
		
		val alpha = 0.01
		val lambda = 0.01
		val model = ALS.trainImplicit(ratings, rank, numIterations, lambda, alpha)
		
22. K-means

	22.1 Example

		import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
		import org.apache.spark.mllib.linalg.Vectors

		val data = sc.textFile("/root/labs/datasets/labs/mllib/kmeans_data.txt")
		val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache()

		val numClusters = 2
		val numIterations = 20
		val clusters = KMeans.train(parsedData, numClusters, numIterations)

		val WSSSE = clusters.computeCost(parsedData)
		println("Within Set Sum of Squared Errors = " + WSSSE)

		clusters.save(sc, "myKmeansModelPath")
		val sameModel = KMeansModel.load(sc, "myKmeansModelPath")	
		

23. Power iteration clustering (PIC)

	23.1 Example

		import org.apache.spark.mllib.clustering.{PowerIterationClustering, PowerIterationClusteringModel}
		import org.apache.spark.mllib.linalg.Vectors

		val data = sc.textFile("/root/labs/datasets/labs/mllib/pic_data.txt")
		val similarities = data.map { line => val parts = line.split(' '); (parts(0).toLong, parts(1).toLong, parts(2).toDouble)}

		val pic = new PowerIterationClustering().setK(2).setMaxIterations(10)
		val model = pic.run(similarities)

		model.assignments.foreach { a =>println(s"${a.id} -> ${a.cluster}")}

		model.save(sc, "myPICModelPath")
		val sameModel = PowerIterationClusteringModel.load(sc, "myPICModelPath")
		
24. Latent Dirichlet allocation (LDA)

	24.1 Example

		import org.apache.spark.mllib.clustering.{LDA, DistributedLDAModel}
		import org.apache.spark.mllib.linalg.Vectors

		val data = sc.textFile("/root/labs/datasets/labs/mllib/sample_lda_data.txt")
		val parsedData = data.map(s => Vectors.dense(s.trim.split(' ').map(_.toDouble)))
		val corpus = parsedData.zipWithIndex.map(_.swap).cache()

		val ldaModel = new LDA().setK(3).run(corpus)

		println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize + " words):")
		val topics = ldaModel.topicsMatrix
		for (topic <- Range(0, 3)) { print("Topic " + topic + ":"); for (word <- Range(0, ldaModel.vocabSize)) { print(" " + topics(word, topic)); }; println()}

		ldaModel.save(sc, "myLDAModel")
		val sameModel = DistributedLDAModel.load(sc, "myLDAModel")	
		
25. FP-growth

	25.1 Example
	
		import org.apache.spark.mllib.fpm.FPGrowth
		import org.apache.spark.rdd.RDD
		val data = sc.textFile("/root/labs/datasets/labs/mllib/sample_fpgrowth.txt")
		val transactions: RDD[Array[String]] = data.map(s => s.trim.split(' '))
		val fpg = new FPGrowth().setMinSupport(0.2).setNumPartitions(10)
		val model = fpg.run(transactions)
		model.freqItemsets.collect().foreach { itemset => println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)}
		val minConfidence = 0.8
		model.generateAssociationRules(minConfidence).collect().foreach { rule => println(rule.antecedent.mkString("[", ",", "]") + " => " + rule.consequent .mkString("[", ",", "]") + ", " + rule.confidence)}

26. Association Rules

	26.1 Example
	
		import org.apache.spark.mllib.fpm.AssociationRules
		import org.apache.spark.mllib.fpm.FPGrowth.FreqItemset

		val freqItemsets = sc.parallelize(Seq(new FreqItemset(Array("a"), 15L), new FreqItemset(Array("b"), 35L), new FreqItemset(Array("a", "b"), 12L)))
		val ar = new AssociationRules().setMinConfidence(0.8)
		val results = ar.run(freqItemsets)
		results.collect().foreach { rule => println("[" + rule.antecedent.mkString(",") + "=>" + rule.consequent.mkString(",") + "]," + rule.confidence)}	
		
27. Binary classification model evaluation

	27.1 Example

		import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
		import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
		import org.apache.spark.mllib.regression.LabeledPoint
		import org.apache.spark.mllib.util.MLUtils

		val data = MLUtils.loadLibSVMFile(sc, "/root/labs/datasets/labs/mllib/sample_binary_classification_data.txt")

		val Array(training, test) = data.randomSplit(Array(0.6, 0.4), seed = 11L)
		training.cache()

		val model = new LogisticRegressionWithLBFGS().setNumClasses(2).run(training)
		model.clearThreshold
		val predictionAndLabels = test.map { case LabeledPoint(label, features) => val prediction = model.predict(features); (prediction, label)}
		val metrics = new BinaryClassificationMetrics(predictionAndLabels)
		val precision = metrics.precisionByThreshold
		precision.foreach { case (t, p) => println(s"Threshold: $t, Precision: $p")}
		val recall = metrics.recallByThreshold
		recall.foreach { case (t, r) => println(s"Threshold: $t, Recall: $r")}
		val PRC = metrics.pr
		val f1Score = metrics.fMeasureByThreshold
		f1Score.foreach { case (t, f) => println(s"Threshold: $t, F-score: $f, Beta = 1")}
		val beta = 0.5
		val fScore = metrics.fMeasureByThreshold(beta)
		f1Score.foreach { case (t, f) => println(s"Threshold: $t, F-score: $f, Beta = 0.5")}
		val auPRC = metrics.areaUnderPR
		println("Area under precision-recall curve = " + auPRC)
		val thresholds = precision.map(_._1)
		val roc = metrics.roc
		val auROC = metrics.areaUnderROC
		println("Area under ROC = " + auROC)	
		
28. PMML model export		

	28.1 Example
	
		import org.apache.spark.mllib.clustering.KMeans
		import org.apache.spark.mllib.linalg.Vectors
		val data = sc.textFile("/root/labs/datasets/labs/mllib/kmeans_data.txt")
		val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache()
		val numClusters = 2
		val numIterations = 20
		val clusters = KMeans.train(parsedData, numClusters, numIterations)
		println("PMML Model:\n" + clusters.toPMML)
		clusters.toPMML("/tmp/kmeans.xml")
		clusters.toPMML(sc,"/tmp/kmeans")
		clusters.toPMML(System.out)

29. Anomaly detection with k-means

	29.0 Launch Spark Shell using the following command
			
		spark-shell --packages com.databricks:spark-csv_2.10:1.5.0

	29.1 Load Data

		val df = sqlContext.load("com.databricks.spark.csv", Map("path" -> "/root/labs/datasets/kddcupsmall.txt", "header" -> "true"))
	
	29.2 Let's check out the schema
	
		df.printSchema;
		df.groupBy($"label").count().orderBy($"count".desc).registerTempTable("networklabelcount");
		
	29.3 Let's Score ks

		val rawData = sc.textFile("/root/labs/datasets/kddcupsmall.txt");
		rawData.map(_.split(',').last).countByValue().toSeq.sortBy(_._2).reverse.foreach(println);
		import org.apache.spark.mllib.linalg._;
		val labelsAndData = rawData.zipWithIndex.flatMap {
			case (line,index) =>
				if (index == 0) {
					None
				} else {
					val buffer = line.split(',').toBuffer;
					buffer.remove(1, 3);
					val label = buffer.remove(buffer.length-1);
					val vector = Vectors.dense(buffer.map(_.toDouble).toArray);
					Some((label,vector));
				}
		};
		import org.apache.spark.mllib.clustering._; 
		def distance(a: Vector, b: Vector) = math.sqrt(a.toArray.zip(b.toArray).map(p => p._1 - p._2).map(d => d * d).sum);
		def distToCentroid(datum: Vector, model: KMeansModel) = {
			val cluster = model.predict(datum);
			val centroid = model.clusterCenters(cluster);
			distance(centroid, datum);
		};
		import org.apache.spark.rdd._;
		val dataAsArray = labelsAndData.values.map(_.toArray).cache();
		val numCols = dataAsArray.first().length;
		val n = dataAsArray.count();
		val sums = dataAsArray.reduce((a,b) => a.zip(b).map(t => t._1 + t._2));
		val sumSquares = dataAsArray.fold(new Array[Double](numCols))((a,b) => a.zip(b).map(t => t._1 + t._2 * t._2));
		val stdevs = sumSquares.zip(sums).map {case(sumSq,sum) => math.sqrt(n*sumSq - sum*sum)/n};
		val means = sums.map(_ / n);
		def normalize(datum: Vector) = {
			val normalizedArray = (datum.toArray, means, stdevs).zipped.map((value, mean, stdev) =>	if (stdev <= 0) (value - mean) else (value - mean) / stdev);
			Vectors.dense(normalizedArray);
		};
		val normalizedLabelsAndData = labelsAndData.map(ld => (ld._1, normalize(ld._2))).cache();
		def entropy(counts: Iterable[Int]) = {
			val values = counts.filter(_ > 0);
			val n: Double = values.sum;
			values.map { v =>
				val p=v/n;
				-p * math.log(p);
			}.sum;
		}
		val kmeansNumberRuns = 10;
		val kmeansEpsilon = 1.0e-6;
		def clusteringScore(normalizedLabelsAndData: RDD[(String,Vector)], k: Int) = {
			val kmeans = new KMeans();
			kmeans.setRuns(kmeansNumberRuns);
			kmeans.setEpsilon(kmeansEpsilon);
			kmeans.setK(k);
			val model = kmeans.run(normalizedLabelsAndData.values);
			val labelsAndClusters = normalizedLabelsAndData.mapValues(model.predict);
			val clustersAndLabels = labelsAndClusters.map(_.swap);
			val labelsInCluster = clustersAndLabels.groupByKey().values;
			val labelCounts = labelsInCluster.map( _.groupBy(l => l).map(_._2.size));
			val n = normalizedLabelsAndData.count();
			labelCounts.map(m => m.sum * entropy(m)).sum / n;
		};

		val kAndScore = (10 to 60 by 10).par.map(k => (k, clusteringScore(normalizedLabelsAndData, k))).toList;
		kAndScore.foreach(println);
		case class kmeansScore(k: Int, score: Double);
		sc.parallelize(kAndScore).map{ case(a, b) => kmeansScore(a, b) }.toDF.registerTempTable("kscore");

	29.4 Find the best k
	
		sqlContext.sql("select * from kscore").collect().foreach(println);
		
		val kmeans = new KMeans();
		kmeans.setRuns(kmeansNumberRuns);
		kmeans.setEpsilon(kmeansEpsilon);
		kmeans.setK(40);
		val model = kmeans.run(normalizedLabelsAndData.values);

		val clusterLabel = normalizedLabelsAndData.map {
			case (label, data) => val cluster = model.predict(data);
			(cluster, label)
		};
		clusterLabel.countByValue.toSeq.sorted.foreach {
			case ((cluster,label),count) =>
				println(f"$cluster%1s$label%18s$count%8s");
		};
		case class clusterLabelType(cluster: Int, label: String);
		clusterLabel.map {
			case (cluster, label) => clusterLabelType(cluster, label);
		}.toDF.registerTempTable("clusterLabel")

	29.5 Easier if we visualize it
		
		sqlContext.sql("select cluster, count(label) as count, label from clusterLabel group by cluster, label").collect().foreach(println);
	
	
	29.6 Let's find the anomalies
	
		// Distance from every point to its nearest centroid; the 100th-largest
		// distance becomes the anomaly threshold.
		val distances = normalizedLabelsAndData.map {
			case (label, data) => distToCentroid(data, model)
		};
		val threshold = distances.top(100).last;
		// Re-parse the raw file, keeping the original CSV line alongside its feature
		// vector so anomalous lines can be reported verbatim.
		val originalAndData = rawData.zipWithIndex.flatMap {
			case (line,index) =>
				if (index == 0) {
					// Skip the header row.
					None
				} else {
					val buffer = line.split(',').toBuffer;
					// Drop the three categorical columns (indices 1..3).
					buffer.remove(1, 3);
					// Last column is the label; the rest are numeric features.
					val label = buffer.remove(buffer.length-1);
					val vector = Vectors.dense(buffer.map(_.toDouble).toArray);
					Some((line,vector));
				};
		};

		// A row is anomalous when its normalized feature vector lies farther from its
		// centroid than the threshold (the 100th-largest distance). Keep the raw line.
		val anomalies = originalAndData.filter {
			case (original, data) =>
				val normalized = normalize(data);
				distToCentroid(normalized, model) > threshold
		}.keys;
		anomalies.toDF.registerTempTable("anomalies");

		// Fixed: output alias was misspelled "entires".
		sqlContext.sql("select `_1` as entries from anomalies").collect().foreach(println);
		
		
30. Topic Distribution (LDA)
	
	30.1 Load the dataset
	
		// Country-song lyrics, one JSON record per song.
		val itemsDF = sqlContext.read.format("json").load("/root/labs/datasets/nlp/country-lyrics.json").select($"id", $"title", $"url", $"lyrics")
		
	30.2 Tokenize

		import org.apache.spark.ml.feature.RegexTokenizer;
		// setGaps(false) + \p{L}+ matches runs of letters (tokens), not separators.
		val tokenizer = new RegexTokenizer().setInputCol("lyrics").setOutputCol("words").setGaps(false).setPattern("\\p{L}+");
		
	30.3 Remove Common stop words

		import org.apache.spark.ml.feature.StopWordsRemover;
		val stopWordsFilter = new StopWordsRemover().setInputCol(tokenizer.getOutputCol).setOutputCol("filteredWords").setCaseSensitive(false);
		val stopWords = stopWordsFilter.getStopWords;
		// Extend the default English stop-word list with lyric-specific filler words
		// (contraction fragments, interjections, etc.).
		val newStopWords = Array("did", "s", "t", "m", "n", "uh", "ll", "ha", "makes", "make", "yeah", "goes", 
			"gettin", "v", "went", "aint", "let", "d", "yer", "don", "got", "just", "ain", "ve", "come", "gonna", 
			"says", "oh", "like", "way", "little", "said", "cause", "know", "say", "long", "time", "day", "wanna",
			"chorus", "repeat", "didn", "hey", "couldn", "wouldn", "ooh", "lets", "ol", "round", "right", "does", 
			"won", "ya", "outta", "tell", "doin", "doing", "boondocks", "nothin", "feelin", "mornin", "whoa", 
			"big", "havin", "comes", "sure", "yea", "mind", "best", "brr", "y") ++ stopWords;
		stopWordsFilter.setStopWords(newStopWords);
		
	30.4 Remove Words below a minimum Document and term Frequency

		import org.apache.spark.ml.feature.CountVectorizer;
		// Keep at most 500 terms appearing in >= 3 documents with term frequency >= 2.
		val vocabSize: Int = 500;
		val countVectorizer = new CountVectorizer().setInputCol(stopWordsFilter.getOutputCol).setOutputCol("countFeatures").setVocabSize(vocabSize).setMinDF(3).setMinTF(2);	
		
	30.5 Add LDA to the pipeline

		import org.apache.spark.ml.clustering.LDA;
		// Online variational LDA with 5 topics over the count-vectorized lyrics.
		val maxIterations: Int = 100;
		val numTopics: Int = 5;
		val lda = new LDA().setFeaturesCol(countVectorizer.getOutputCol).setOptimizer("online").setK(numTopics).setMaxIter(maxIterations).setTopicDistributionCol("topicDistribution");
		
	30.6 cluster the songs using topic distribution vector similarity

		import org.apache.spark.ml.clustering.KMeans;
		// Cluster songs in topic-distribution space (5-dim vectors) into 10 groups.
		val kmeans = new KMeans().setK(10).setFeaturesCol(lda.getTopicDistributionCol).setPredictionCol("cluster");
		
	30.7 Setup the pipeline

		import org.apache.spark.ml.Pipeline;
		// Stage order matters: tokenize -> remove stop words -> count -> LDA -> k-means.
		val pipeline = new Pipeline().setStages(Array(tokenizer, stopWordsFilter, countVectorizer, lda, kmeans));
		
	30.8 Fit the Pipeline Model to Input data

		val pipelineModel = pipeline.fit(itemsDF);
		
	30.9 Transform to produce the topic distribution

		val pipelineResultsDF = pipelineModel.transform(itemsDF);
		
	30.10 Calculate the logLikelihood and logPerplexity

		// Stage index 3 of the fitted pipeline is the LDA model.
		import org.apache.spark.ml.clustering.LDAModel;
		val ldaModel = pipelineModel.stages(3).asInstanceOf[LDAModel];
		val logLikelihood = ldaModel.logLikelihood(pipelineResultsDF);
		val logPerplexity = ldaModel.logPerplexity(pipelineResultsDF);

	30.11 Describe the topics

		// Top 5 (termIndex, weight) pairs per topic.
		val topicsDF = ldaModel.describeTopics(maxTermsPerTopic = 5);
		
	30.12 Retrieve the Complete Vocabulary Array

		// Stage index 2 is the fitted CountVectorizer; vocabulary maps index -> word.
		import org.apache.spark.ml.feature.CountVectorizerModel;
		val vocabArray = pipelineModel.stages(2).asInstanceOf[CountVectorizerModel].vocabulary;
		
	30.13 Enrich the (Index, Weight) mappings to use word, instead

		// Replace each term index with its vocabulary word, keeping the weight.
		val termWeightsPerTopicRDD = topicsDF.select($"termIndices", $"termWeights").map(row => {
			val terms = row.getSeq[Int](0);
			val termWeights = row.getSeq[Double](1);
			terms.map(idx => vocabArray(idx)).zip(termWeights);
		})	
		
	30.14 Show top words per topic

		println(s"\n$numTopics topics:\n");
		termWeightsPerTopicRDD.collect().zipWithIndex.foreach { case (topic, i) => 
			println(s"Topic $i"); 
			topic.foreach { case (term, weight) => println(s"$term\t$weight") };
			println(s"==========");
		}	
		
31. Synonyms (Word2Vec)

	31.1 Load the Data
		
		val itemsDF = sqlContext.read.format("json").load("/root/labs/datasets/nlp/country-lyrics.json").select($"id", $"title", $"url", $"lyrics");
		
	31.2 Tokenize
			
		import org.apache.spark.ml.feature.RegexTokenizer;
		// Same letter-run tokenizer as the LDA lab.
		val tokenizer = new RegexTokenizer().setInputCol("lyrics").setOutputCol("words").setGaps(false).setPattern("\\p{L}+");
		
	31.3 Add Word2Vec to the pipeline
	
		import org.apache.spark.ml.feature.Word2Vec;
		// Default vector size / min count; learns embeddings from tokenized lyrics.
		val word2Vec = new Word2Vec().setInputCol(tokenizer.getOutputCol).setOutputCol("word2VecFeatures");
		import org.apache.spark.ml.Pipeline
		val pipeline = new Pipeline().setStages(Array(tokenizer, word2Vec));
		
	31.4 Build the Model	
	
		val model = pipeline.fit(itemsDF);
		val word2VecDF = model.transform(itemsDF);
		import org.apache.spark.ml.feature.Word2VecModel;
		// Stage index 1 is the fitted Word2Vec model (stage 0 is the tokenizer).
		val word2VecModel = model.stages(1).asInstanceOf[Word2VecModel];
		
	31.5 Find the Synonyms for Truck	

		// 5 nearest neighbours of "truck" in the embedding space.
		// Fixed: the original assigned the Unit result of foreach to synonymsDF;
		// keep the DataFrame in the val and print it separately.
		val term = "truck";
		val synonymsDF = word2VecModel.findSynonyms(term, 5);
		synonymsDF.collect().foreach(println);

	31.6 Find the Synonyms for Love	

		// 5 nearest neighbours of "love" in the embedding space.
		// Fixed: the original assigned the Unit result of foreach to synonymsDF;
		// keep the DataFrame in the val and print it separately.
		val term = "love";
		val synonymsDF = word2VecModel.findSynonyms(term, 5);
		synonymsDF.collect().foreach(println);

	31.7 Find the Synonyms for God	

		// 5 nearest neighbours of "god" in the embedding space.
		// Fixed: the original assigned the Unit result of foreach to synonymsDF;
		// keep the DataFrame in the val and print it separately.
		val term = "god";
		val synonymsDF = word2VecModel.findSynonyms(term, 5);
		synonymsDF.collect().foreach(println);

32. Predicting airline delays

	32.0 Launch Spark Shell using the following command
			
		spark-shell --packages joda-time:joda-time:2.9.1
		
	32.1 Pre-processing with Hadoop and Spark

		import org.apache.spark.rdd._;
		import scala.collection.JavaConverters._;
		import au.com.bytecode.opencsv.CSVReader;
		import java.io._;
		import org.joda.time._;
		import org.joda.time.format._;
		import org.joda.time.format.DateTimeFormat;
		import org.joda.time.DateTime;
		import org.joda.time.Days;

		// One parsed flight record; fields mirror the airline on-time CSV columns
		// (all kept as String and converted lazily in gen_features).
		case class DelayRec(year: String,
                    month: String,
                    dayOfMonth: String,
                    dayOfWeek: String,
                    crsDepTime: String,
                    depDelay: String,
                    origin: String,
                    distance: String,
                    cancelled: String) {
			// US federal holidays for 2007 and 2008 (MM/dd/yyyy).
			val holidays = List("01/01/2007", "01/15/2007", "02/19/2007", "05/28/2007", "06/07/2007", "07/04/2007",
			"09/03/2007", "10/08/2007" ,"11/11/2007", "11/22/2007", "12/25/2007",
			"01/01/2008", "01/21/2008", "02/18/2008", "05/22/2008", "05/26/2008", "07/04/2008",
			"09/01/2008", "10/13/2008" ,"11/11/2008", "11/27/2008", "12/25/2008");

			// Returns (yyyyMMdd date key, feature array). Element 0 is the raw
			// departure delay (used later as the label); the rest are model inputs:
			// month, day of month, day of week, scheduled hour, distance,
			// days from nearest holiday.
			def gen_features: (String, Array[Double]) = {
				val values = Array(depDelay.toDouble,month.toDouble,dayOfMonth.toDouble,dayOfWeek.toDouble,get_hour(crsDepTime).toDouble,distance.toDouble,
							days_from_nearest_holiday(year.toInt, month.toInt, dayOfMonth.toInt));
				new Tuple2(to_date(year.toInt, month.toInt, dayOfMonth.toInt), values)
			};
		
			// Zero-pad the HHMM departure time and keep the hour digits.
			def get_hour(depTime: String) : String = "%04d".format(depTime.toInt).take(2);
			def to_date(year: Int, month: Int, day: Int) = "%04d%02d%02d".format(year, month, day);
			// Minimum absolute day-distance to any listed holiday (3000 = sentinel "far").
			def days_from_nearest_holiday(year:Int, month:Int, day:Int): Int = {
				val sampleDate = new DateTime(year, month, day, 0, 0);
				holidays.foldLeft(3000) { (r, c) =>
					val holiday = DateTimeFormat.forPattern("MM/dd/yyyy").parseDateTime(c);
					val distance = Math.abs(Days.daysBetween(holiday, sampleDate).getDays);
					math.min(r, distance);
				};
			}
		}

		// Parse a raw flights CSV into DelayRec records, keeping only the
		// non-cancelled departures from Chicago O'Hare (ORD).
		def prepFlightDelays(infile: String): RDD[DelayRec] = {
			sc.textFile(infile)
				.map { line =>
					// One CSV record per line; opencsv handles quoted fields.
					val fields = new CSVReader(new StringReader(line)).readAll().asScala.toList;
					fields.map(f => DelayRec(f(0),f(1),f(2),f(3),f(5),f(15),f(16),f(18),f(21)));
				}
				.map(_.head)
				.filter(rec => rec.year != "Year")
				.filter(rec => rec.cancelled == "0")
				.filter(rec => rec.origin == "ORD");
		};

		val data_2007tmp = prepFlightDelays("/tmp/airflightsdelays/flights_2007.csv.bz2");
		// Keep only the feature arrays (drop the yyyyMMdd date key) for modeling.
		val data_2007 = data_2007tmp.map(rec => rec.gen_features._2);
		val data_2008 = prepFlightDelays("/tmp/airflightsdelays/flights_2008.csv.bz2").map(rec => rec.gen_features._2);

		// Expose the parsed 2007 records to SQL for exploration.
		data_2007tmp.toDF().registerTempTable("data_2007tmp");
		data_2007.take(5).map(x => x mkString ",").foreach(println);
		
	32.2 Let's explore the data using SQL and visualizations

		// Delayed (>15 min) vs on-time counts, grouped by day of week.
		sqlContext.sql("select dayofWeek, case when depDelay > 15 then 'delayed' else 'ok' end , count(1) from data_2007tmp group by dayofweek , case when depDelay > 15 then 'delayed' else 'ok' end ").collect().foreach(println);
		
		// Same breakdown by scheduled departure hour (crsDepTime is HHMM).
		sqlContext.sql("select cast( cast(crsDepTime as int) / 100 as int) as hour,  case when depDelay > 15 then 'delayed' else 'ok' end as delay, count(1) as count from  data_2007tmp group by  cast( cast(crsDepTime as int) / 100 as int),  case when depDelay > 15 then 'delayed' else 'ok' end").collect().foreach(println);

	32.3 Modeling with Spark and ML-Lib

		import org.apache.spark.mllib.regression.LabeledPoint;
		import org.apache.spark.mllib.linalg.Vectors;
		import org.apache.spark.mllib.feature.StandardScaler;

		// Label = 1.0 when departure delay >= 15 minutes; features = remaining columns.
		def parseData(vals: Array[Double]): LabeledPoint = {
			LabeledPoint(if (vals(0)>=15) 1.0 else 0.0, Vectors.dense(vals.drop(1)));
		};

		val parsedTrainData = data_2007.map(parseData);
		parsedTrainData.cache;
		// Fit the scaler on 2007 (train) only; the SAME scaler is applied to 2008
		// (test) below to avoid leaking test statistics.
		val scaler = new StandardScaler(withMean = true, withStd = true).fit(parsedTrainData.map(x => x.features));
		val scaledTrainData = parsedTrainData.map(x => LabeledPoint(x.label, scaler.transform(Vectors.dense(x.features.toArray))));
		scaledTrainData.cache;

		val parsedTestData = data_2008.map(parseData);
		parsedTestData.cache;
		val scaledTestData = parsedTestData.map(x => LabeledPoint(x.label, scaler.transform(Vectors.dense(x.features.toArray))));
		scaledTestData.cache;

		scaledTrainData.take(3).map(x => (x.label, x.features)).foreach(println);
		
		
		// Confusion-matrix metrics for binary predictions.
		// NOTE: despite the parameter name, callers in this lab supply tuples as
		// (prediction, label) — see labelsAndPreds_lr / _svm / _dt above.
		// Returns ((tp, tn, fp, fn), (precision, recall, F-measure, accuracy)).
		def eval_metrics(labelsAndPreds: RDD[(Double, Double)]) : Tuple2[Array[Double], Array[Double]] = {
			// Count pairs whose (prediction, label) equal the given values.
			def countPairs(pred: Int, label: Int): Double =
				labelsAndPreds.filter(r => r._1 == pred && r._2 == label).count.toDouble;
			val tp = countPairs(1, 1);
			val tn = countPairs(0, 0);
			val fp = countPairs(1, 0);
			val fn = countPairs(0, 1);

			val precision = tp / (tp+fp);
			val recall = tp / (tp+fn);
			val F_measure = 2*precision*recall / (precision+recall);
			val accuracy = (tp+tn) / (tp+tn+fp+fn);
			new Tuple2(Array(tp, tn, fp, fn), Array(precision, recall, F_measure, accuracy));
		}

		import org.apache.spark.rdd._;
		import org.apache.spark.rdd.RDD;

		// Lazily-computed binary classification metrics over (label, prediction) pairs.
		// Each metric triggers at most one Spark job the first time it is read.
		class Metrics(labelsAndPreds: RDD[(Double, Double)]) extends java.io.Serializable {
			// Count pairs equal to (lftBnd, rtBnd) after truncating both to Int.
			private def filterCount(lftBnd:Int,rtBnd:Int):Double =
				labelsAndPreds.filter(p => p._1.toInt == lftBnd && p._2.toInt == rtBnd).count();

			lazy val tp = filterCount(1,1);  // label 1, predicted 1
			lazy val tn = filterCount(0,0);  // label 0, predicted 0
			lazy val fp = filterCount(0,1);  // label 0, predicted 1
			lazy val fn = filterCount(1,0);  // label 1, predicted 0

			lazy val precision = tp / (tp+fp);
			lazy val recall = tp / (tp+fn);
			lazy val F1 = 2*precision*recall / (precision+recall);
			lazy val accuracy = (tp+tn) / (tp+tn+fp+fn);
		};
		
	32.4 Logistic Regression with SGD	
	
		import org.apache.spark.mllib.classification.LogisticRegressionWithSGD;
		// Logistic regression via SGD; eval_metrics expects (prediction, label) tuples.
		val model_lr = LogisticRegressionWithSGD.train(scaledTrainData, numIterations=100);
		val labelsAndPreds_lr = scaledTestData.map { point => val pred = model_lr.predict(point.features); (pred, point.label)};
		val m_lr = eval_metrics(labelsAndPreds_lr)._2;
		println("precision = %.2f, recall = %.2f, F1 = %.2f, accuracy = %.2f".format(m_lr(0), m_lr(1), m_lr(2), m_lr(3)));
		println(model_lr.weights);
	
		
	32.5 SVM with SGD

		import org.apache.spark.mllib.classification.SVMWithSGD;
		// Linear SVM with explicit optimizer settings (100 iterations, L2 reg 1.0).
		val svmAlg = new SVMWithSGD();
		svmAlg.optimizer.setNumIterations(100).setRegParam(1.0).setStepSize(1.0);
		val model_svm = svmAlg.run(scaledTrainData);
		// Tuples are (prediction, label) — the order eval_metrics expects.
		val labelsAndPreds_svm = scaledTestData.map { point =>
			val pred = model_svm.predict(point.features);
			(pred, point.label)
		};
		val m_svm = eval_metrics(labelsAndPreds_svm)._2;
		println("precision = %.2f, recall = %.2f, F1 = %.2f, accuracy = %.2f".format(m_svm(0), m_svm(1), m_svm(2), m_svm(3)));
		
	32.6 Decision Tree

		import org.apache.spark.mllib.tree.DecisionTree;
		// Decision tree on the UNSCALED data (trees are insensitive to feature scaling).
		val numClasses = 2;
		val categoricalFeaturesInfo = Map[Int, Int]();
		val impurity = "gini";
		val maxDepth = 10;
		val maxBins = 100;
		val model_dt = DecisionTree.trainClassifier(parsedTrainData, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins);
		val labelsAndPreds_dt = parsedTestData.map { point =>
			val pred = model_dt.predict(point.features);
			(pred, point.label);
		};
		val m_dt = eval_metrics(labelsAndPreds_dt)._2;
		println("precision = %.2f, recall = %.2f, F1 = %.2f, accuracy = %.2f".format(m_dt(0), m_dt(1), m_dt(2), m_dt(3)));


	32.7 Random Forest
	
		import org.apache.spark.mllib.tree.RandomForest;
		import org.apache.spark.mllib.tree.configuration.Strategy;
		val treeStrategy = Strategy.defaultStrategy("Classification");
		val numTrees = 100; 
		val featureSubsetStrategy = "auto";
		val model_rf = RandomForest.trainClassifier(parsedTrainData, treeStrategy, numTrees, featureSubsetStrategy, seed = 123);
		// NOTE: tuples here are (label, prediction) — the order the Metrics class
		// expects, and the opposite of what eval_metrics above was fed.
		val labelsAndPreds_rf = parsedTestData.map { point =>
			val pred = model_rf.predict(point.features);
			(point.label, pred);
		};
		val m_rf = new Metrics(labelsAndPreds_rf);
		println("precision = %.2f, recall = %.2f, F1 = %.2f, accuracy = %.2f".format(m_rf.precision, m_rf.recall, m_rf.F1, m_rf.accuracy));
		
	
	32.8 Building a richer model with flight delays, weather data using Apache Spark and ML-Lib

		import org.apache.spark.SparkContext._;
		import scala.collection.JavaConverters._;
		import au.com.bytecode.opencsv.CSVReader;
		import java.io._;

		// Join flight-delay features with same-day weather metrics for station
		// USW00094846 (Chicago O'Hare in GHCN-daily). TMIN, TMAX, PRCP, SNOW and
		// AWND values are appended to each feature array, in that order.
		def preprocess_spark(delay_file: String, weather_file: String): RDD[Array[Double]] = { 
			// (yyyyMMdd date key, flight feature array) pairs.
			val delayRecs = prepFlightDelays(delay_file).map{ rec => 
				val features = rec.gen_features;
				(features._1, features._2);
			};

			// GHCN-daily column layout: station, date, metric, value.
			val station_inx = 0;
			val date_inx = 1;
			val metric_inx = 2;
			val value_inx = 3;

			// (date -> value) pairs for a single weather metric.
			def filterMap(wdata:RDD[Array[String]], metric:String):RDD[(String,Double)] = {
				wdata.filter(vals => vals(metric_inx) == metric).map(vals => (vals(date_inx), vals(value_inx).toDouble));
			};

			val wdata = sc.textFile(weather_file).map(line => line.split(",")).filter(vals => vals(station_inx) == "USW00094846");
			val w_tmin = filterMap(wdata,"TMIN");
			val w_tmax = filterMap(wdata,"TMAX");
			val w_prcp = filterMap(wdata,"PRCP");
			val w_snow = filterMap(wdata,"SNOW");
			val w_awnd = filterMap(wdata,"AWND");

			// Successive inner joins on the date key; each step appends one value.
			// The final map drops the date key, leaving pure feature arrays.
			delayRecs.join(w_tmin).map(vals => (vals._1, vals._2._1 ++ Array(vals._2._2))).join(w_tmax).map(vals => (vals._1, vals._2._1 ++ Array(vals._2._2))).join(w_prcp).map(vals => (vals._1, vals._2._1 ++ Array(vals._2._2))).join(w_snow).map(vals => (vals._1, vals._2._1 ++ Array(vals._2._2))).join(w_awnd).map(vals => vals._2._1 ++ Array(vals._2._2));
		};

		val data_2007 = preprocess_spark("/tmp/airflightsdelays/flights_2007.csv.bz2", "/tmp/airflightsdelays/weather_2007.csv.gz");
		val data_2008 = preprocess_spark("/tmp/airflightsdelays/flights_2008.csv.bz2", "/tmp/airflightsdelays/weather_2008.csv.gz");

		data_2007.take(5).map(x => x mkString ",").foreach(println);
		
		
	32.9 Modeling with weather data

		import org.apache.spark.mllib.regression.LabeledPoint;
		import org.apache.spark.mllib.linalg.Vectors;
		import org.apache.spark.mllib.feature.StandardScaler;

		// Same labeling rule as before; features now include the 5 weather columns.
		def parseData(vals: Array[Double]): LabeledPoint = {
			LabeledPoint(if (vals(0)>=15) 1.0 else 0.0, Vectors.dense(vals.drop(1)));
		};

		val parsedTrainData = data_2007.map(parseData);
		// Scaler is fit on train (2007) only and reused for test (2008).
		val scaler = new StandardScaler(withMean = true, withStd = true).fit(parsedTrainData.map(x => x.features));
		val scaledTrainData = parsedTrainData.map(x => LabeledPoint(x.label, scaler.transform(Vectors.dense(x.features.toArray))));
		parsedTrainData.cache;
		scaledTrainData.cache;

		val parsedTestData = data_2008.map(parseData);
		val scaledTestData = parsedTestData.map(x => LabeledPoint(x.label, scaler.transform(Vectors.dense(x.features.toArray))));
		parsedTestData.cache;
		scaledTestData.cache;

		scaledTrainData.take(5).map(x => (x.label, x.features)).foreach(println);
		
	32.10 LogisticRegressionWithSGD

		import org.apache.spark.mllib.classification.LogisticRegressionWithSGD;
		// LR on the weather-enriched features; tuples are (label, prediction)
		// as expected by the Metrics class.
		val model_lr = LogisticRegressionWithSGD.train(scaledTrainData, numIterations=100);
		val labelsAndPreds_lr = scaledTestData.map { point =>
			val pred = model_lr.predict(point.features);
			(point.label, pred);
		};
		val m_lr = new Metrics(labelsAndPreds_lr);
		println("precision = %.2f, recall = %.2f, F1 = %.2f, accuracy = %.2f".format(m_lr.precision, m_lr.recall, m_lr.F1, m_lr.accuracy));
		println(model_lr.weights);

	32.11 Decision Tree
	
		import org.apache.spark.mllib.tree.DecisionTree;
		// Decision tree on the unscaled weather-enriched features.
		val numClasses = 2;
		val categoricalFeaturesInfo = Map[Int, Int]();
		val impurity = "gini";
		val maxDepth = 10;
		val maxBins = 100;
		val model_dt = DecisionTree.trainClassifier(parsedTrainData, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins);
		val labelsAndPreds_dt = parsedTestData.map { point =>
			val pred = model_dt.predict(point.features);
			(point.label, pred);
		};
		val m_dt = new Metrics(labelsAndPreds_dt);
		println("precision = %.2f, recall = %.2f, F1 = %.2f, accuracy = %.2f".format(m_dt.precision, m_dt.recall, m_dt.F1, m_dt.accuracy));

	32.12 Random Forest

		import org.apache.spark.mllib.tree.RandomForest;
		import org.apache.spark.mllib.tree.configuration.Strategy;
		// Random forest (100 trees) on the unscaled weather-enriched features.
		val treeStrategy = Strategy.defaultStrategy("Classification");
		val model_rf = RandomForest.trainClassifier(parsedTrainData, treeStrategy, numTrees = 100, featureSubsetStrategy = "auto", seed = 125);
		val labelsAndPreds_rf = parsedTestData.map { point =>
			val pred = model_rf.predict(point.features);
			(point.label, pred);
		};
		val m_rf = new Metrics(labelsAndPreds_rf);
		println("precision = %.2f, recall = %.2f, F1 = %.2f, accuracy = %.2f".format(m_rf.precision, m_rf.recall, m_rf.F1, m_rf.accuracy));
		

33. Advanced Machine Learning Skills

	import org.apache.spark.ml.linalg.Vectors
	import org.apache.spark.ml.feature.RFormula
	import org.apache.spark.ml.classification.LogisticRegression
	import org.apache.spark.ml.Pipeline
	import org.apache.spark.ml.tuning.ParamGridBuilder
	import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
	import org.apache.spark.ml.tuning.TrainValidationSplit
	import org.apache.spark.ml.PipelineModel
	import org.apache.spark.ml.classification.LogisticRegressionModel
	import org.apache.spark.ml.tuning.TrainValidationSplitModel

	// Dense vs sparse vector construction and conversion.
	val denseVec = Vectors.dense(1.0, 2.0, 3.0)
	val size = 3
	val idx = Array(1,2) 
	val values = Array(2.0,3.0)
	val sparseVec = Vectors.sparse(size, idx, values)
	
	sparseVec.toDense
	denseVec.toSparse

	var df = spark.read.json("file:///root/TrainingOnHDP/dataset/spark/mllib/simple-ml")
	df.orderBy("value2").show()

	spark.read.format("libsvm").load("file:///root/TrainingOnHDP/dataset/spark/mllib/sample_libsvm_data.txt")

	// RFormula builds label/features columns from an R-style formula,
	// including color:value interaction terms.
	val supervised = new RFormula().setFormula("lab ~ . + color:value1 + color:value2")

	val fittedRF = supervised.fit(df)
	val preparedDF = fittedRF.transform(df)
	preparedDF.show()

	val Array(train, test) = preparedDF.randomSplit(Array(0.7, 0.3))

	val lr = new LogisticRegression().setLabelCol("label").setFeaturesCol("features")

	println(lr.explainParams())

	val fittedLR = lr.fit(train)

	fittedLR.transform(train).select("label", "prediction").show()

	// NOTE(review): re-declaring train/test and lr below only works in the REPL,
	// where each statement shadows the previous definition.
	val Array(train, test) = df.randomSplit(Array(0.7, 0.3))

	val rForm = new RFormula()
	val lr = new LogisticRegression().setLabelCol("label").setFeaturesCol("features")

	val stages = Array(rForm, lr)
	val pipeline = new Pipeline().setStages(stages)

	// Hyper-parameter grid: 2 formulas x 3 elasticNet values x 2 regParams = 12 models.
	val params = new ParamGridBuilder()
		.addGrid(rForm.formula, Array("lab ~ . + color:value1",	"lab ~ . + color:value1 + color:value2"))
		.addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0))
		.addGrid(lr.regParam, Array(0.1, 2.0))
		.build()

	// NOTE(review): areaUnderROC is usually computed from the "rawPrediction"
	// column, not the hard "prediction" column — confirm this is intentional.
	val evaluator = new BinaryClassificationEvaluator()
			.setMetricName("areaUnderROC")
			.setRawPredictionCol("prediction")
			.setLabelCol("label")

	// Single 75/25 train/validation split over the parameter grid.
	val tvs = new TrainValidationSplit()
			.setTrainRatio(0.75)
			.setEstimatorParamMaps(params)
			.setEstimator(pipeline)
			.setEvaluator(evaluator)

	val tvsFitted = tvs.fit(train)

	evaluator.evaluate(tvsFitted.transform(test))

	// Extract the winning pipeline and inspect the LR training curve.
	val trainedPipeline = tvsFitted.bestModel.asInstanceOf[PipelineModel]
	val TrainedLR = trainedPipeline.stages(1).asInstanceOf[LogisticRegressionModel]
	val summaryLR = TrainedLR.summary
	summaryLR.objectiveHistory

	// Persist the fitted model, reload it, and apply it to the test set.
	tvsFitted.write.overwrite().save("/tmp/modelLocation")

	val model = TrainValidationSplitModel.load("/tmp/modelLocation")
	
	model.transform(test)


34. Feature Engineering

	import org.apache.spark.ml.feature.Tokenizer;
	import org.apache.spark.ml.feature.StandardScaler;
	import org.apache.spark.ml.feature.RFormula;
	import org.apache.spark.ml.feature.SQLTransformer;
	import org.apache.spark.ml.feature.VectorAssembler;
	import org.apache.spark.ml.feature.Bucketizer;
	import org.apache.spark.ml.feature.QuantileDiscretizer;
	import org.apache.spark.ml.feature.StandardScaler;
	import org.apache.spark.ml.feature.MinMaxScaler;
	import org.apache.spark.ml.feature.MaxAbsScaler;
	import org.apache.spark.ml.feature.ElementwiseProduct;
	import org.apache.spark.ml.linalg.Vectors;
	import org.apache.spark.ml.feature.Normalizer;
	import org.apache.spark.ml.feature.StringIndexer;
	import org.apache.spark.ml.feature.IndexToString;
	import org.apache.spark.ml.feature.VectorIndexer;
	import org.apache.spark.ml.feature.{StringIndexer, OneHotEncoder};
	import org.apache.spark.ml.feature.Tokenizer;
	import org.apache.spark.ml.feature.RegexTokenizer;
	import org.apache.spark.ml.feature.RegexTokenizer;
	import org.apache.spark.ml.feature.StopWordsRemover;
	import org.apache.spark.ml.feature.NGram;
	import org.apache.spark.ml.feature.CountVectorizer;
	import org.apache.spark.ml.feature.{HashingTF, IDF};
	import org.apache.spark.ml.feature.Word2Vec;
	import org.apache.spark.ml.linalg.Vector;
	import org.apache.spark.sql.Row;
	import org.apache.spark.ml.feature.PCA;
	import org.apache.spark.ml.feature.PolynomialExpansion;
	import org.apache.spark.ml.feature.{ChiSqSelector, Tokenizer};
	import org.apache.spark.ml.feature.PCAModel;
	import org.apache.spark.ml.UnaryTransformer;
	import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable};
	import org.apache.spark.sql.types.{ArrayType, StringType, DataType};
	import org.apache.spark.ml.param.{IntParam, ParamValidators};
	import org.apache.spark.ml.evaluation.{MulticlassClassificationEvaluator, BinaryClassificationEvaluator};

	
	// Four demo datasets: retail sales (CSV), integer columns, simple-ml (JSON),
	// and pre-vectorized scaling data.
	val sales = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("file:///root/TrainingOnHDP/dataset/spark/mllib/retail-data/by-day/*.csv").coalesce(5).where("Description IS NOT NULL")
	val fakeIntDF = spark.read.parquet("file:///root/TrainingOnHDP/dataset/spark/mllib/simple-ml-integers")
	var simpleDF = spark.read.json("file:///root/TrainingOnHDP/dataset/spark/mllib/simple-ml")
	val scaleDF = spark.read.parquet("file:///root/TrainingOnHDP/dataset/spark/mllib/simple-ml-scaling")

	sales.cache()
	sales.show()

	// Whitespace tokenizer (default output column when none is set).
	val tkn = new Tokenizer().setInputCol("Description")
	tkn.transform(sales.select("Description")).show(false)

	val ss = new StandardScaler().setInputCol("features")
	ss.fit(scaleDF).transform(scaleDF).show(false)

	val supervised = new RFormula().setFormula("lab ~ . + color:value1 + color:value2")
	supervised.fit(simpleDF).transform(simpleDF).show()

	// SQLTransformer: __THIS__ is replaced by the input DataFrame at transform time.
	val basicTransformation = new SQLTransformer().setStatement("""
		SELECT sum(Quantity), count(*), CustomerID
		FROM __THIS__
		GROUP BY CustomerID
	""")

	basicTransformation.transform(sales).show()

	// Combine multiple numeric columns into one feature vector.
	val va = new VectorAssembler().setInputCols(Array("int1", "int2", "int3"))
	va.transform(fakeIntDF).show()

	val contDF = spark.range(20).selectExpr("cast(id as double)")

	// Manual bucket boundaries vs quantile-based discretization below.
	val bucketBorders = Array(-1.0, 5.0, 10.0, 250.0, 600.0)
	val bucketer = new Bucketizer().setSplits(bucketBorders).setInputCol("id")
	bucketer.transform(contDF).show()

	val bucketer = new QuantileDiscretizer().setNumBuckets(5).setInputCol("id")
	val fittedBucketer = bucketer.fit(contDF)
	fittedBucketer.transform(contDF).show()

	// Scaling variants: standard (z-score), min-max into [5,10], max-abs into [-1,1].
	val sScaler = new StandardScaler().setInputCol("features")
	sScaler.fit(scaleDF).transform(scaleDF).show()

	val minMax = new MinMaxScaler().setMin(5).setMax(10).setInputCol("features")
	val fittedminMax = minMax.fit(scaleDF)
	fittedminMax.transform(scaleDF).show()

	val maScaler = new MaxAbsScaler().setInputCol("features")
	val fittedmaScaler = maScaler.fit(scaleDF)
	fittedmaScaler.transform(scaleDF).show()

	// Element-wise (Hadamard) product against a fixed scaling vector.
	val scaleUpVec = Vectors.dense(10.0, 15.0, 20.0)
	val scalingUp = new ElementwiseProduct().setScalingVec(scaleUpVec).setInputCol("features")
	scalingUp.transform(scaleDF).show()

	// p=1 norm = Manhattan (L1) normalization.
	val manhattanDistance = new Normalizer().setP(1).setInputCol("features")
	manhattanDistance.transform(scaleDF).show()

	// String -> numeric index; most frequent value gets index 0.
	val lblIndxr = new StringIndexer().setInputCol("lab").setOutputCol("labelInd")
	val idxRes = lblIndxr.fit(simpleDF).transform(simpleDF)
	idxRes.show()

	val valIndexer = new StringIndexer().setInputCol("value1").setOutputCol("valueInd")

	valIndexer.fit(simpleDF).transform(simpleDF).show()

	// "skip" drops rows whose value was unseen at fit time instead of failing.
	valIndexer.setHandleInvalid("skip")
	valIndexer.fit(simpleDF).setHandleInvalid("skip")

	// Reverse mapping: numeric index back to the original string.
	val labelReverse = new IndexToString().setInputCol("labelInd")
	labelReverse.transform(idxRes).show()

	val idxIn = spark.createDataFrame(Seq(
			(Vectors.dense(1, 2, 3),1),
			(Vectors.dense(2, 5, 6),2),
			(Vectors.dense(1, 8, 9),3)
	)).toDF("features", "label")

	// VectorIndexer treats vector slots with <= maxCategories distinct values
	// as categorical and re-indexes them.
	val indxr = new VectorIndexer().setInputCol("features").setOutputCol("idxed").setMaxCategories(2)
	indxr.fit(idxIn).transform(idxIn).show

	// Index then one-hot encode the categorical color column.
	val lblIndxr = new StringIndexer().setInputCol("color").setOutputCol("colorInd")
	val colorLab = lblIndxr.fit(simpleDF).transform(simpleDF.select("color"))
	val ohe = new OneHotEncoder().setInputCol("colorInd")
	ohe.transform(colorLab).show()

	val tkn = new Tokenizer().setInputCol("Description").setOutputCol("DescOut")
	val tokenized = tkn.transform(sales.select("Description"))
	tokenized.show(false)

	// With gaps=true (default) the pattern matches SEPARATORS...
	val rt = new RegexTokenizer()
		.setInputCol("Description")
		.setOutputCol("DescOut")
		.setPattern(" ")
		.setToLowercase(true)
	
	rt.transform(sales.select("Description")).show(false)

	// ...with gaps=false the pattern matches the TOKENS themselves.
	val rt = new RegexTokenizer()
		.setInputCol("Description")
		.setOutputCol("DescOut")
		.setPattern(" ")
		.setGaps(false)
		.setToLowercase(true)

	rt.transform(sales.select("Description")).show(false)

	val englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
	val stops = new StopWordsRemover().setStopWords(englishStopWords).setInputCol("DescOut")

	stops.transform(tokenized).show()

	// Unigrams and bigrams from the tokenized descriptions.
	val unigram = new NGram().setInputCol("DescOut").setN(1)
	val bigram = new NGram().setInputCol("DescOut").setN(2)
	unigram.transform(tokenized.select("DescOut")).show(false)
	bigram.transform(tokenized.select("DescOut")).show(false)

	// Bag-of-words counts: vocab capped at 500, term freq >= 1, doc freq >= 2.
	val cv = new CountVectorizer()
		.setInputCol("DescOut")
		.setOutputCol("countVec")
		.setVocabSize(500)
		.setMinTF(1)
		.setMinDF(2)
	
	val fittedCV = cv.fit(tokenized)
	fittedCV.transform(tokenized).show(false)

	val tfIdfIn = tokenized.where("array_contains(DescOut, 'red')").select("DescOut").limit(10)
	tfIdfIn.show(false)

	// Hashing TF into 10k buckets, then IDF weighting (min doc freq 2).
	val tf = new HashingTF().setInputCol("DescOut").setOutputCol("TFOut").setNumFeatures(10000)
	val idf = new IDF().setInputCol("TFOut").setOutputCol("IDFOut").setMinDocFreq(2)

	idf.fit(tf.transform(tfIdfIn)).transform(tf.transform(tfIdfIn)).show(false)

	val documentDF = spark.createDataFrame(Seq(
		"Hi I heard about Spark".split(" "),
		"I wish Java could use case classes".split(" "),
		"Logistic regression models are neat".split(" ")
	).map(Tuple1.apply)).toDF("text")

	// Tiny Word2Vec demo: 3-dimensional embeddings, no minimum word count.
	val word2Vec = new Word2Vec()
		.setInputCol("text")
		.setOutputCol("result")
		.setVectorSize(3)
		.setMinCount(0)

	val model = word2Vec.fit(documentDF)
	val result = model.transform(documentDF)
	result.collect().foreach { case Row(text: Seq[_], features: Vector) =>
		println(s"Text: [${text.mkString(", ")}] => \nVector: $features\n")
	}

	// Dimensionality reduction: PCA to 2 components; polynomial expansion degree 2.
	val pca = new PCA().setInputCol("features").setK(2)
	pca.fit(scaleDF).transform(scaleDF).show(false)

	val pe = new PolynomialExpansion().setInputCol("features").setDegree(2)
	pe.transform(scaleDF).show(false)

	val tkn = new Tokenizer().setInputCol("Description").setOutputCol("DescOut")
	val tokenized = tkn.transform(sales.select("Description", "CustomerId")).where("CustomerId IS NOT NULL")

	// Chi-squared feature selection: keep the 2 count features most associated
	// with CustomerId (used here as the label).
	val prechi = fittedCV.transform(tokenized)
	val chisq = new ChiSqSelector()
		.setFeaturesCol("countVec")
		.setLabelCol("CustomerId")
		.setNumTopFeatures(2)

	chisq.fit(prechi).transform(prechi).drop("customerId", "Description", "DescOut").show()

	// Persist a fitted transformer and reload it.
	val fittedPCA = pca.fit(scaleDF)
	fittedPCA.write.overwrite().save("/tmp/fittedPCA")

	val loadedPCA = PCAModel.load("/tmp/fittedPCA")
	loadedPCA.transform(scaleDF).show()

	// Custom one-column Transformer: splits a string on whitespace and keeps at
	// most maxWords tokens. DefaultParamsWritable gives it save() support.
	class MyTokenizer(override val uid: String) extends UnaryTransformer[String, Seq[String], MyTokenizer] with DefaultParamsWritable {
		def this() = this(Identifiable.randomUID("myTokenizer"))
		// maxWords must be >= 0 (enforced by the validator).
		val maxWords: IntParam = new IntParam(this, "maxWords", "The max number of words to return.", ParamValidators.gtEq(0))
		def setMaxWords(value: Int): this.type = set(maxWords, value)
		def getMaxWords: Integer = $(maxWords)
		// The per-row transformation applied to the input column.
		override protected def createTransformFunc: String => Seq[String] = ( inputString: String) => {
			inputString.split("\\s").take($(maxWords))
		}
		// Reject non-string input columns at transformSchema time.
		override protected def validateInputType(inputType: DataType): Unit = {
			require(inputType == StringType, s"Bad input type: $inputType. Requires String.")
		}
		override protected def outputDataType: DataType = new ArrayType(StringType, true)
	}

	// Companion enables MyTokenizer.load(...) via DefaultParamsReadable.
	object MyTokenizer extends DefaultParamsReadable[MyTokenizer]

	// Demo: keep only the first 2 whitespace-separated tokens.
	val myT = new MyTokenizer().setInputCol("someCol").setMaxWords(2)
	myT.transform(Seq("hello world. This text won't show.").toDF("someCol")).show()

	
35. Implementation of Spark ML Classification Model

	35.1 Prepare the input data
	
		cd /root
		wget https://archive.ics.uci.edu/ml/machine-learning-databases/00296/dataset_diabetes.zip
		unzip -x dataset_diabetes.zip
		/root/dataset_diabetes/diabetic_data.csv and /root/dataset_diabetes/IDs_mapping.csv are raw input data set
		
	35.2 Launch Spark-shell

	35.3 Import the following classes

		import spark.implicits._;
		import org.apache.spark.sql.types._;
		import org.apache.spark.sql.{DataFrameNaFunctions, Row};
		import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler, IndexToString, VectorIndexer, OneHotEncoder, PCA, Binarizer, VectorSlicer, StandardScaler, Bucketizer, ChiSqSelector, Normalizer };
		import org.apache.spark.ml.Pipeline;
		import org.apache.spark.sql.functions.{ sum,when , row_number, max, broadcast};
		import org.apache.spark.sql.expressions.Window;
		import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier, LogisticRegression, DecisionTreeClassificationModel, DecisionTreeClassifier, GBTClassificationModel, GBTClassifier};
		import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder};
		import org.apache.spark.ml.linalg.{Vector, Vectors};
	
	35.4 Features
	
		Feature name				Type		Description and values													% missing
		
		Encounter ID				Numeric		Unique identifier of an encounter											0%
		Patient number				Numeric		Unique identifier of a patient												0%
		Race						Nominal		Values: Caucasian, Asian, African American, Hispanic, and other				2%
		Gender						Nominal		Values: male, female, and unknown/invalid									0%
		Age							Nominal		Grouped in 10-year intervals: [0, 10), [10, 20), …, [90, 100)				0%
		Weight						Numeric		Weight in pounds.															97%
		Admission type				Nominal		Integer identifier corresponding to 9 distinct values, 
												for example, emergency, urgent, elective, newborn, and not available		0%
		Discharge disposition		Nominal		Integer identifier corresponding to 29 distinct values, for example, 
												discharged to home, expired, and not available								0%
		Admission source			Nominal		Integer identifier corresponding to 21 distinct values, for example, 
												physician referral, emergency room, and transfer from a hospital			0%
		Time in hospital			Numeric		Integer number of days between admission and discharge						0%
		Payer code					Nominal		Integer identifier corresponding to 23 distinct values, for example, 
												Blue Cross/Blue Shield, Medicare, and self-pay								52%
		Medical specialty			Nominal		Integer identifier of a specialty of the admitting physician, 
												corresponding to 84 distinct values, for example, cardiology, 
												internal medicine, family/general practice, and surgeon						53%
		Number of lab procedures	Numeric		Number of lab tests performed during the encounter							0%
		Number of procedures		Numeric		Number of procedures (other than lab tests) performed during the encounter	0%
		Number of medications		Numeric		Number of distinct generic names administered during the encounter			0%
		Number of outpatient visits	Numeric		Number of outpatient visits of the patient in the year preceding 
												the encounter																0%
		Number of emergency visits	Numeric		Number of emergency visits of the patient in the year preceding the 
												encounter																	0%
		Number of inpatient visits	Numeric		Number of inpatient visits of the patient in the year preceding the 
												encounter																	0%
		Diagnosis 1					Nominal		The primary diagnosis (coded as first three digits of ICD9); 
												848 distinct values															0%
		Diagnosis 2					Nominal		Secondary diagnosis (coded as first three digits of ICD9); 
												923 distinct values															0%
		Diagnosis 3					Nominal		Additional secondary diagnosis (coded as first three digits of ICD9); 
												954 distinct values															1%
		Number of diagnoses			Numeric		Number of diagnoses entered to the system									0%
		Glucose serum test result	Nominal		Indicates the range of the result or if the test was not taken. 
												Values: “>200,” “>300,” “normal,” and “none” if not measured				0%
		A1c test result				Nominal		Indicates the range of the result or if the test was not taken. 
												Values: “>8” if the result was greater than 8%, “>7” if the result was 
												greater than 7% but less than 8%, “normal” if the result was less than 7%, 
												and “none” if not measured.													0%
		Change of medications		Nominal		Indicates if there was a change in diabetic medications 
												(either dosage or generic name). Values: “change” and “no change”			0%
		Diabetes medications		Nominal		Indicates if there was any diabetic medication prescribed. Values: 
												“yes” and “no”																0%
		24 features for medications	Nominal		For the generic names: metformin, repaglinide, nateglinide, chlorpropamide, 
												glimepiride, acetohexamide, glipizide, glyburide, tolbutamide, 
												pioglitazone, rosiglitazone, acarbose, miglitol, troglitazone, 
												tolazamide, examide, sitagliptin, insulin, glyburide-metformin, 
												glipizide-metformin, glimepiride-pioglitazone, metformin-rosiglitazone, 
												and metformin-pioglitazone, 
												the feature indicates whether the drug was prescribed or there was a change 
												in the dosage. Values: “up” if the dosage was increased during the 
												encounter, “down” if the dosage was decreased, “steady” if the dosage 
												did not change, and “no” if the drug was not prescribed						0%
		Readmitted					Nominal		Days to inpatient readmission. Values: “<30” if the patient was readmitted 
												in less than 30 days, “>30” if the patient was readmitted in more than 
												30 days, and “No” for no record of readmission.								0%
	
	
	35.5 Exploring the Dataset
	
		35.5.1 Load input data into Spark DataFrame
		
			val inDiaDataDF = spark.read.option("header", true).csv("file:///root/dataset_diabetes/diabetic_data.csv").cache()
			
		35.5.2 Display the schema of the DataFrame created	
		
			inDiaDataDF.printSchema()
			
		35.5.3 Print out a few sample records to get a high-level sense of the values	
		
			inDiaDataDF.take(5).foreach(println)
			
		35.5.4 Compute the basic statistics for numerical columns	
			
			Display the count, mean, standard deviation, and min and max values of a few numeric data columns	
			
			inDiaDataDF.select("num_lab_procedures", "num_procedures", "num_medications", "number_diagnoses").describe().show()
			
		35.5.4 The original input Dataset contains incomplete, redundant, and noisy information, as expected in any real-world Dataset. 
			There are several fields that have a high percentage of missing values.
			
			Compute the number of records that have specific fields missing
			
			inDiaDataDF.select($"weight").groupBy($"weight").count().select($"weight", (($"count" / inDiaDataDF.count())*100).alias("percent_recs")).where("weight = '?'").show()
			
			inDiaDataDF.select($"payer_code").groupBy($"payer_code").count().select($"payer_code", (($"count" / inDiaDataDF.count())*100).alias("percent_recs")).where("payer_code = '?'").show()

			inDiaDataDF.select($"medical_specialty").groupBy($"medical_specialty").count().select($"medical_specialty", (($"count" / inDiaDataDF.count())*100).alias("percent_recs")).where("medical_specialty = '?'").show()

		35.5.5 Drop the weight and payer code columns; however, the medical specialty attribute (potentially a very relevant feature) is retained:		
		
			val diaDataDrpDF = inDiaDataDF.drop("weight", "payer_code")
			
		35.5.6 Dataset contains records of multiple inpatient visits by some of the patients	
		
			diaDataDrpDF.select($"patient_nbr").groupBy($"patient_nbr").count().where("count > 1").show(5)
			
			diaDataDrpDF.select($"patient_nbr").groupBy($"patient_nbr").count().where("count > 1").count()
			
		35.5.7 Include only the first encounter for each patient
		
			val w = Window.partitionBy($"patient_nbr").orderBy($"encounter_id".desc)
			val diaDataSlctFirstDF = diaDataDrpDF.withColumn("rn", row_number.over(w)).where($"rn" === 1).drop("rn")

			diaDataSlctFirstDF.select($"patient_nbr").groupBy($"patient_nbr").count().where("count > 1").show()
			diaDataSlctFirstDF.count()
			
		35.5.8 Remove records of encounters that resulted in a patient's death to avoid bias
		
			val diaDataAdmttedDF = diaDataSlctFirstDF.filter($"discharge_disposition_id" =!= "11")
			diaDataAdmttedDF.count()
			
			
		35.5.9 Define the schema

			val admTypeId = StructField("admTypeId", DataTypes.IntegerType);
			val admType = StructField("description", DataTypes.StringType);
			val fields = Array(admTypeId, admType);
			val schema = StructType(fields);

		35.5.10 Load input data into Spark DataFrame

			val admTypeDF = spark.read.option("header", true).schema(schema).csv("file:///root/dataset_diabetes/admission_type.csv")
		
		35.5.11	Print out Schema
		
			admTypeDF.printSchema()
			
		35.5.12 Print out a few sample records
			
			admTypeDF.take(5).foreach(println)

		35.5.13 Define the schema

			val admSrcId = StructField("admSrcId", DataTypes.IntegerType);
			val admSrc = StructField("description", DataTypes.StringType);
			val fields = Array(admSrcId, admSrc);
			val schema = StructType(fields);

		35.5.14 Load input data into Spark DataFrame

			val admSrcDF = spark.read.option("header", true).csv("file:///root/dataset_diabetes/admission_source.csv")
			
			Note: unlike steps 35.5.10 and 35.5.18, the schema defined in step 35.5.13 is not applied here, so the column names come from the CSV header (admission_source_id, description); the join in step 35.5.21 relies on that header column name.
		
		35.5.15	Print out Schema
		
			admSrcDF.printSchema()
			
		35.5.16 Print out a few sample records
			
			admSrcDF.take(5).foreach(println)

		35.5.17 Define the schema

			val dchrgDispId = StructField("dchrgDispId", DataTypes.IntegerType);
			val dchrgDisp = StructField("description", DataTypes.StringType);
			val fields = Array(dchrgDispId, dchrgDisp);
			val schema = StructType(fields);

		35.5.18 Load input data into Spark DataFrame

			val dchrgDispDF = spark.read.option("header", true).schema(schema).csv("file:///root/dataset_diabetes/discharge_disposition.csv")
		
		35.5.19	Print out Schema
		
			dchrgDispDF.printSchema()
			
		35.5.20 Print out a few sample records
			
			dchrgDispDF.take(5).foreach(println)

	
		35.5.21 Execute a set of JOIN operations to understand the data better in terms of top categories

			val joinDF = diaDataAdmttedDF.join(dchrgDispDF, diaDataAdmttedDF("discharge_disposition_id") === dchrgDispDF("dchrgDispId")).withColumnRenamed("description", "dchrgDisp").drop(dchrgDispDF("dchrgDispId")).join(admTypeDF, diaDataAdmttedDF("admission_type_id") === admTypeDF("admTypeId")).withColumnRenamed("description", "admType").drop(admTypeDF("admTypeId")).join(admSrcDF, diaDataAdmttedDF("admission_source_id") === admSrcDF("admission_source_id")).withColumnRenamed("description", "admission_source").drop(admSrcDF("admission_source_id"))
			joinDF.select("encounter_id", "dchrgDisp", "admType", "admission_source").show(5)
			joinDF.select("encounter_id", "dchrgDisp").groupBy("dchrgDisp").count().orderBy($"count".desc).take(5).foreach(println)
			joinDF.select("encounter_id", "admType").groupBy("admType").count().orderBy($"count".desc).take(5).foreach(println)
			joinDF.select("encounter_id", "admission_source").groupBy("admission_source").count().orderBy($"count".desc).take(5).foreach(println)
		
	35.6 Pre-processing the data

			Addressing the missing field values: 
				1. drop them using df.na.drop()
				2. fill them with default values using df.na.fill()
				3. fields can be replaced with the most commonly occurring values for that column
				4. for numeric fields, they can also be replaced with average values
				5. train a regression model on the column and use it to predict the field values for rows where values are missing
				
		35.6.1 Use the df.na.replace() function to replace ? with the "Missing" string
		
			diaDataAdmttedDF.select("medical_specialty").where("medical_specialty = '?'").groupBy("medical_specialty").count().show()
			
			val diaDataRplcMedSplDF = diaDataAdmttedDF.na.replace("medical_specialty", Map("?" -> "Missing"))
			
		35.6.2	Could have created a new binary feature called has_medical_specialty and assigned it a value of 1 when a row contained the
			value and 0 when it was unknown or missing. Alternatively, we could also have created a binary feature for each value of medical_specialty, 
			such as Is_Cardiology, Is_Surgeon, and Is_Missing.
			
		35.6.3 drop a set of columns from further analysis, to keep the size of the problem reasonable	
			
			val diaDataDrpColsDF = diaDataRplcMedSplDF.drop("encounter_id", "patient_nbr", "diag_2", "diag_3", "max_glu_serum", "metformin", "repaglinide", "nateglinide", "chlorpropamide", "glimepiride", "acetohexamide", "glipizide", "glyburide", "tolbutamide", "pioglitazone", "rosiglitazone", "acarbose", "miglitol", "troglitazone", "tolazamide", "examide", "citoglipton", "insulin", "glyburide-metformin", "glipizide-metformin", "glimepiride-pioglitazone", "metformin-rosiglitazone", "metformin-pioglitazone")
			
		35.6.4 Consider four groups of encounters: 
			no HbA1c test performed
			HbA1c performed and in normal range
			HbA1c performed and the result is greater than 8 percent
			HbA1c performed and the result is greater than 7 percent but less than 8 percent	
			
			diaDataDrpColsDF.groupBy($"A1Cresult").count().show()
			
		35.6.5 Define the function to categorize A1Cresult
		
			def udfA1CGrps() = udf[Double, String] { a => val x = a match { case "None" => 1.0; case ">8" => 2.0; case ">7" => 3.0; case "Norm" => 4.0;}; x;} 

			val diaDataA1CResultsDF = diaDataDrpColsDF.withColumn("A1CResGrp", udfA1CGrps()($"A1Cresult"))

			diaDataA1CResultsDF.groupBy("A1CResGrp").count().withColumn("Percent_of_Population", ($"count" / diaDataA1CResultsDF.count())*100).withColumnRenamed("count", "Num_of_Encounters").show()

		35.6.6 Create a new ordinal feature, called Readmitted, with two values: Readmitted and Not Readmitted	

			def udfReAdmBins() = udf[String, String] { a => val x = a match { case "<30" => "Readmitted"; case "NO" => "Not Readmitted"; case ">30" => "Not Readmitted";}; x;}

			val diaDataReadmtdDF = diaDataA1CResultsDF.withColumn("Readmitted", udfReAdmBins()($"readmitted"))

			
		35.6.7 Display the numbers of several features versus the values of the target variable, as follows. This will help identify skews in the number of records based on various attributes
			in the input Dataset	
			
			diaDataReadmtdDF.groupBy("race").pivot("Readmitted").agg(count("Readmitted")).show()

			diaDataReadmtdDF.groupBy("A1CResGrp").pivot("Readmitted").agg(count("Readmitted")).orderBy("A1CResGrp").show()
			
			diaDataReadmtdDF.groupBy("gender").pivot("Readmitted").agg(count("Readmitted")).show()
			
		35.6.8 Group the various age ranges into categories and add the result as a column to obtain our final version of the Dataset	
				Remove the three rows where the gender is Unknown/Invalid		
		
			def udfAgeBins() = udf[String, String] { a => val x = a match { case "[0-10)" => "Young"; case "[10-20)" => "Young"; case "[20-30)" => "Young"; case "[30-40)" => "Middle"; case "[40-50)" => "Middle"; case "[50-60)" => "Middle"; case "[60-70)" => "Elder";  case "[70-80)" => "Elder"; case "[80-90)" => "Elder"; case "[90-100)" => "Elder";}; x;}
			val diaDataAgeBinsDF = diaDataReadmtdDF.withColumn("age_category", udfAgeBins()($"age"))
			val diaDataRmvGndrDF = diaDataAgeBinsDF.filter($"gender" =!= "Unknown/Invalid")

			
		35.6.9 Final DataFrame
		
			val diaDataFinalDF = diaDataRmvGndrDF.select($"race", $"gender", $"age_category", $"admission_type_id".cast(IntegerType), $"discharge_disposition_id".cast(IntegerType), $"admission_source_id".cast(IntegerType), $"time_in_hospital".cast(IntegerType), $"num_lab_procedures".cast(DoubleType), $"num_procedures".cast(IntegerType), $"num_medications".cast(IntegerType), $"number_outpatient".cast(IntegerType), $"number_emergency".cast(IntegerType), $"number_inpatient".cast(IntegerType), $"diag_1", $"number_diagnoses".cast(IntegerType), $"A1CResGrp", $"change", $"diabetesMed", $"Readmitted").withColumnRenamed("age_category", "age")
			
			diaDataFinalDF.printSchema()
			diaDataFinalDF.take(5).foreach(println)

			
	35.7 Building the Spark ML pipeline

		35.7.1 Using StringIndexer for indexing categorical features and labels
		
			35.7.1.1 Use a StringIndexer to transform String features to Double values
					raceIndexer is an estimator that transforms the race column
	
				The fit() method then converts the column into a StringType and counts the numbers of each race
			
					val raceIndexer = new StringIndexer().setInputCol("race").setOutputCol("raceCat").fit(diaDataFinalDF)

				The transform() assigns the generated index to each value of the race in the column	
				
					raceIndexer.transform(diaDataFinalDF).select("race", "raceCat").show()

					raceIndexer.transform(diaDataFinalDF).select("race", "raceCat").groupBy("raceCat").count().show()

				val raceIndexer = new StringIndexer().setInputCol("race").setOutputCol("raceCat").fit(diaDataFinalDF)

				val rDF = raceIndexer.transform(diaDataFinalDF)

			35.7.1.2 Create indexers for the gender, age groups, HbA1c test results, change of medications, and diabetes medications prescribed, and fit them to the resulting DataFrames
				at each step
				
				val genderIndexer = new StringIndexer().setInputCol("gender").setOutputCol("genderCat").fit(rDF)
				val gDF = genderIndexer.transform(rDF)

				val ageCategoryIndexer  = new StringIndexer().setInputCol("age").setOutputCol("ageCat").fit(gDF)
				val acDF = ageCategoryIndexer.transform(gDF)

				val A1CresultIndexer  = new StringIndexer().setInputCol("A1CResGrp").setOutputCol("A1CResGrpCat").fit(acDF)
				val a1crDF = A1CresultIndexer.transform(acDF)

				val changeIndexer  = new StringIndexer().setInputCol("change").setOutputCol("changeCat").fit(a1crDF)
				val cDF = changeIndexer.transform(a1crDF)

				val diabetesMedIndexer  = new StringIndexer().setInputCol("diabetesMed").setOutputCol("diabetesMedCat").fit(cDF)
				val dmDF = diabetesMedIndexer.transform(cDF)

			35.7.1.3 Print out schema of the resulting DataFrame containing the columns for various indexers	
				
				dmDF.printSchema()

			35.7.1.4 Index the labels using StringIndexer	
			
				val labelIndexer = new StringIndexer().setInputCol("Readmitted").setOutputCol("indexedLabel")
				
			35.7.1.5 Alternatively, we can also define our feature indexers as illustrated below. The
					sequence of StringIndexers can then be concatenated with the numeric features to derive
					the features vector using a VectorAssembler	

				val catFeatColNames = Seq("race", "gender", "age", "A1CResGrp", "change", "diabetesMed")
				val stringIndexers = catFeatColNames.map { colName =>
						new StringIndexer()
							.setInputCol(colName)
							.setOutputCol(colName + "Cat")
							.fit(diaDataFinalDF)
					}	
					
		35.7.2 Using VectorAssembler for assembling features into one column section

			
			35.7.2.1 Due to the significant skew in the number of records for each label, sample the records in
				appropriate proportions to have nearly equal numbers of records for each label
				
				val dataDF = dmDF.stat.sampleBy("Readmitted", Map("Readmitted" -> 1.0, "Not Readmitted" -> .030), 0)
	
			35.7.2.2 Assemble all our feature columns into a single column containing a vector that groups all our features
	
				val assembler = new VectorAssembler().setInputCols(Array("num_lab_procedures", "num_procedures", "num_medications", "number_outpatient", "number_emergency", "number_inpatient", "number_diagnoses", "admission_type_id", "discharge_disposition_id", "admission_source_id", "time_in_hospital", "raceCat", "genderCat", "ageCat", "A1CResGrpCat", "changeCat", "diabetesMedCat")).setOutputCol("features")
	
				Alternatively, this can also be done as follows
				
				val numFeatNames = Seq("num_lab_procedures", "num_procedures", "num_medications", "number_outpatient", "number_emergency", "number_inpatient", "number_diagnoses", "admission_type_id", "discharge_disposition_id", "admission_source_id", "time_in_hospital")
				val catFeatNames = catFeatColNames.map(_ + "Cat")
				val allFeatNames = numFeatNames ++ catFeatNames
				val assembler = new VectorAssembler().setInputCols(Array(allFeatNames: _*)).setOutputCol("features")
			
			35.7.2.3 Apply the transform() operation and print a few sample records of the resulting DataFrame
			
				val df2 = assembler.transform(dataDF)
				
				df2.select("Readmitted", "features").take(5).foreach(println)
				
			35.7.2.4 VectorIndexer is used for indexing the features.

				val featureIndexer = new VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").setMaxCategories(4).fit(df2)
				
		35.7.3 Using a Spark ML classifier

			35.7.3.1 Create a RandomForestClassifier component
			
				val rf = new RandomForestClassifier().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setNumTrees(10)
				
			35.7.3.2 Create a DecisionTreeClassifier component
			
				val dt = new DecisionTreeClassifier().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures")
				
			35.7.3.3 Create a GBTClassifier component
				
				val gbt = new GBTClassifier().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setMaxIter(10)
			

		35.7.4 Creating a Spark ML pipeline

			val pipeline = new Pipeline().setStages(Array(labelIndexer, featureIndexer, rf))
		
		
		35.7.5 Creating the training and test Datasets
		
			val Array(trainingData, testData) = df2.randomSplit(Array(0.8, 0.2), 11L)
			
		35.7.6 Use the pipeline to fit the training data. A PipelineModel object is returned as a result of fitting the pipeline to the training data
		
			val model = pipeline.fit(trainingData)
			
			
		35.7.7 Making predictions using the PipelineModel	
		
			val predictions = model.transform(testData)
			predictions.select("prediction", "indexedLabel", "features").show(25)
			val predictionAndLabels = predictions.select("prediction","indexedLabel").rdd.map { row =>
					(row.get(0).asInstanceOf[Double],row.get(1).asInstanceOf[Double])
			}
			
		35.7.8 Evaluate our model by measuring the accuracy of the predictions	
		
			val evaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("accuracy")
			val accuracy = evaluator.evaluate(predictions)
			println("Test Error = " + (1.0 - accuracy))
			
		35.7.9 Print our random forest model to understand the logic being used in the ten trees created in our model
		
			val rfModel = model.stages(2).asInstanceOf[RandomForestClassificationModel]
			println("Learned classification forest model:\n" + rfModel.toDebugString)
			
		35.7.10 Selecting the best model

			For each combination of parameters, perform cross-validation and retain the best model according to some performance indicator
			
			val paramGrid = new ParamGridBuilder().addGrid(rf.maxBins, Array(25, 28, 31)).addGrid(rf.maxDepth, Array(4, 6, 8)).addGrid(rf.impurity, Array("entropy", "gini")).build()
			val evaluator = new BinaryClassificationEvaluator().setLabelCol("indexedLabel")
			val cv = new CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(paramGrid).setNumFolds(2)
			val crossValidatorModel = cv.fit(df2)
			val predictions = crossValidatorModel.transform(testData)
			predictions.select("prediction", "indexedLabel", "features").show(25)
			val areaUnderROC = evaluator.evaluate(predictions)
			println("Area under ROC = " + areaUnderROC)
		
		35.7.11 Changing the ML algorithm in the pipeline
		
			35.7.11.1 Replace the random forest classifier with a logistic regression model
			
			val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8).setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures")
			val pipeline = new Pipeline().setStages(Array(labelIndexer, featureIndexer, lr))
			val Array(trainingData, testData) = df2.randomSplit(Array(0.8, 0.2), 11L)
			val model = pipeline.fit(trainingData)
			val predictions = model.transform(testData)
			predictions.select("A1CResGrpCat", "indexedLabel", "prediction").show()
			predictions.select($"indexedLabel", $"prediction").where("indexedLabel != prediction").count()
			
	35.8 Spark ML tools and utilities

		35.8.1 Using Principal Component Analysis to select features
		
			PCA is a statistical procedure that converts a set of potentially correlated variables into a, typically, reduced set of linearly uncorrelated variables
			
			val pca = new PCA().setInputCol("features").setOutputCol("pcaFeatures").setK(3).fit(df2)
			val result = pca.transform(df2).select("pcaFeatures")
			result.take(5).foreach(println)
			
		35.8.2 Using encoders

			one-hot encoding to map a column of label indices to a column of binary vectors with, at most, a single one-value. This encoding allows algorithms that
			expect continuous features, such as LogisticRegression, to use categorical features
		
			val indexer = new StringIndexer().setInputCol("race").setOutputCol("raceIndex").fit(df2)
			val indexed = indexer.transform(df2)
			val encoder = new OneHotEncoder().setInputCol("raceIndex").setOutputCol("raceVec")
			val encoded = encoder.transform(indexed)
			encoded.select("race","raceIndex","raceVec").show()
			encoded.select("race","raceIndex","raceVec").take(50).foreach(println)

		35.8.3 Using Bucketizer

			Bucketizer is used to transform a column of continuous features to a column of feature buckets.	
			
			val splits = Array(Double.NegativeInfinity, 20.0, 40.0, 60.0, 80.0, 100.0, Double.PositiveInfinity)
			val bucketizer = new Bucketizer().setInputCol("num_lab_procedures").setOutputCol("bucketedLabProcs").setSplits(splits)
			val bucketedData = bucketizer.transform(df2)
			println(s"Bucketizer output with ${bucketizer.getSplits.length-1} buckets")
			bucketedData.select("num_lab_procedures","bucketedLabProcs").show()
			
		35.8.4 Using VectorSlicer

			A VectorSlicer is a Transformer that takes a feature vector and returns a new feature vector that is a subset of the original features. It is useful for extracting features from a
			vector column
			
			val slicer = new VectorSlicer().setInputCol("features").setOutputCol("slicedFeatures").setNames(Array("raceCat", "genderCat", "ageCat", "A1CResGrpCat"))
			val output = slicer.transform(df2)
			output.select("slicedFeatures").take(5).foreach(println)
			
			val slicer = new VectorSlicer().setInputCol("features").setOutputCol("slicedFeatures").setNames(Array("raceCat", "genderCat", "ageCat"))
			val output = slicer.transform(df2)
			output.select("slicedFeatures").take(5).foreach(println)
			
		35.8.5 Using Chi-squared selector

			ChiSqSelector enables chi-squared feature selection. It operates on labeled data with categorical features. ChiSqSelector uses the chi-squared test of independence to choose
			the features.
		
			def udfReAdmLabels() = udf[Double, String] { a => val x = a match { case "Readmitted" => 1.0; case "Not Readmitted" => 0.0;}; x;}
			val df3 = df2.withColumn("reAdmLabel", udfReAdmLabels()($"Readmitted"))
			val selector = new ChiSqSelector().setNumTopFeatures(1).setFeaturesCol("features").setLabelCol("reAdmLabel").setOutputCol("selectedFeatures")
			val result = selector.fit(df3).transform(df3)
			println(s"ChiSqSelector output with top ${selector.getNumTopFeatures} features selected")
			result.select("features", "reAdmLabel", "selectedFeatures").show()
			
		35.8.6 Using a Normalizer

			val normalizer = new Normalizer().setInputCol("raw_features").setOutputCol("features")
			
		35.8.7 Retrieving our original labels

			IndexToString is the reverse operation of StringIndexer that converts the indices back to their original labels.
			
			val labelIndexer = new StringIndexer().setInputCol("Readmitted").setOutputCol("indexedLabel").fit(df2)
			val featureIndexer = new VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").setMaxCategories(4).fit(df2)
			val Array(trainingData, testData) = df2.randomSplit(Array(0.7, 0.3))
			val gbt = new GBTClassifier().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setMaxIter(10)
			val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
			val pipeline = new Pipeline().setStages(Array(labelIndexer, featureIndexer, gbt, labelConverter))

			val model = pipeline.fit(trainingData)

			val predictions = model.transform(testData)
			predictions.select("predictedLabel", "indexedLabel", "features").show(5)